Example #1
0
    def parse_sub_company_list(self, response):
        """Collect the branch-company entries and request the product list.

        Every ``<li>`` under ``div.ge`` contributes a dict holding the text of
        its first three ``<p>`` children as ``name``, ``address`` and
        ``phone``.  The resulting list is stored JSON-encoded on the
        ``company`` object carried in ``response.meta``; afterwards a POST
        request for the product list is yielded, with
        ``parse_product_list`` as its callback.
        """
        company = response.meta['company']

        sub_company_list = []
        for entry in response.xpath('//div[@class="ge"]/ul/li'):
            sub_company_list.append({
                'name': get_content(entry.xpath('string(p[1])').extract()),
                'address': get_content(entry.xpath('string(p[2])').extract()),
                'phone': get_content(entry.xpath('string(p[3])').extract()),
            })

        # No ``encoding`` keyword: Python 3's json.dumps rejects it with a
        # TypeError, and on Python 2 UTF-8 is already the default.
        company['sub_company_list'] = json.dumps(sub_company_list,
                                                 ensure_ascii=False)

        # Current insurance product information.
        yield scrapy.FormRequest(
            url='http://icid.iachina.cn/ICID/front/viewAllPros.do',
            method='POST',
            formdata={
                'columnid': company['column_id'],
                'internetInformationNo': company['info_no'],
                'informationno': company['info_no'],
                'zj': company['zj']
            },
            meta={
                'company': company,
                'type': 'cur'
            },
            callback=self.parse_product_list,
            dont_filter=True)
Example #2
0
    def parse_company_detail(self, response):
        """Store the company's key/value detail table and request its branches.

        For each ``<li>`` under ``div.jie_nei`` the text of the first ``<p>``
        is used as key and the string value of the second ``<p>`` as value
        (a repeated key overwrites the earlier value).  The mapping is saved
        JSON-encoded in ``company['detail_info']`` and a POST request for the
        branch list is yielded, with ``parse_sub_company_list`` as callback.
        """
        company = response.meta['company']

        data_dict = {}
        for entry in response.xpath('//div[@class="jie_nei"]/ul/li'):
            key = get_content(entry.xpath('p[1]/text()').extract())
            data_dict[key] = get_content(entry.xpath('string(p[2])').extract())

        # No ``encoding`` keyword: Python 3's json.dumps rejects it with a
        # TypeError, and on Python 2 UTF-8 is already the default.
        company['detail_info'] = json.dumps(data_dict, ensure_ascii=False)

        # Second-level branch company information.
        yield scrapy.FormRequest(
            url='http://icid.iachina.cn/ICID/front/viewAllBranch.do',
            method='POST',
            formdata={
                'columnid': company['column_id'],
                'internetInformationNo': company['info_no'],
                'informationno': company['info_no'],
                'zj': company['zj']
            },
            meta={'company': company},
            callback=self.parse_sub_company_list,
            dont_filter=True)
    def parse(self, response):
        """Build a SymptomDetailInfoItem from one 39health symptom page.

        The last path segment of the URL selects what is extracted: an empty
        segment fills ``name`` and ``description``; any other segment fills
        the single field named by ``self.url_attr_db_map``.  Extraction is
        best effort: on failure a warning is logged and the item is returned
        with whatever was filled before the error.
        """
        attr_type = response.url.split('/')[-1]

        # NOTE(review): '%d' raises TypeError when the URL is absent from
        # self.mapping (``.get`` would then yield None) - confirm the mapping
        # always contains every crawled URL.
        symbol = (self.mapping.get(response.url), self.url_attr_map[attr_type],
                  response.url)
        self.logger.info('Parsing ID.%d 39health Symptom %s Info From <%s>.' %
                         symbol)

        item = SymptomDetailInfoItem()
        item['s_id'] = symbol[0]
        try:
            if attr_type == '':
                item['name'] = get_content(
                    response.xpath('//h1/text()').extract())
                item['description'] = get_content(
                    response.xpath('//dd[@id="intro"]/p/text()').extract())
            else:
                item[self.url_attr_db_map[attr_type]] = \
                    get_content(response.xpath('//div[@class="item catalogItem"]').extract())
        except Exception:
            # Still best effort, but no longer silent and no longer catching
            # KeyboardInterrupt/SystemExit as the bare ``except`` did.
            self.logger.warning(
                'Failed to extract 39health Symptom %s Info From <%s>.',
                symbol[1], symbol[2], exc_info=True)

        return item
Example #4
0
    def parse(self, response):
        """Parse one Wangjia news page (``div.con_news`` layout) into a XinwenItem.

        Returns ``None`` after logging a warning when the thread/category
        cannot be derived from the URL, or when the page contains the
        ``alert_info`` message box.
        """
        tc = self.get_thread_category_from_url(response.url)
        if not (tc[0] and tc[1]):
            self.logger.warning("Invalid Wangjia News Item From <%s>." % response.url)
            return None

        thread_id, category_id = tc[0], tc[1]
        log_args = (thread_id, self.tab[category_id], response.url)
        alert_box = response.xpath('//div[@id="messagetext" and @class="alert_info"]')
        if alert_box:
            self.logger.warning("No.%s Wangjia News %s Item From <%s> Maybe Limited." % log_args)
            return None

        self.logger.info("Parsing No.%s Wangjia News %s Item From <%s>." % log_args)

        item = XinwenItem()
        item["thread"] = int(thread_id)
        item["category_id"] = category_id
        item["source"] = response.url

        article = response.xpath('//div[@class="con_news"]')
        item["title"] = get_content(article.xpath("h1/text()").extract())

        # The ``n_time`` line is split on ':'; the second piece feeds
        # ``created`` and the last piece feeds ``author``.
        time_line = article.xpath('ul/li[@class="n_time"]/text()').extract()[0]
        pieces = time_line.encode("utf8").split(":")
        item["created"] = get_content(pieces[1].split())
        item["author"] = get_content(pieces[-1].split())
        item["summary"] = get_content(
            article.xpath('ul/li[@class="a_abstract"]/span/text()').extract())

        body = article.xpath('ul/li[@class="news_con_p"]')
        text_nodes = body.xpath(".//text()").extract()
        item["content"] = "".join([get_trunk(node) for node in text_nodes])
        item["raw_content"] = get_content(body.extract())

        # Image URLs are joined with '#'; an empty result is stored as None.
        image_sources = body.xpath(".//img/@src").extract()
        joined_images = "#".join(
            [self.modify_image_url(get_trunk(src)) for src in image_sources])
        item["image_url"] = joined_images or None

        return item
Example #5
0
    def parse(self, response):
        """Return one PingjiItem per rating-table row that has a unique key.

        Rows come from the table under ``div.main_con1``; each column is read
        from the ``<td>`` at the matching position.  Rows whose ``get_uk()``
        is falsy are dropped.
        """
        timestamp = self.timestamp
        self.logger.info('Parsing %s Wangjia Rating From <%s>.' % (timestamp, response.url))

        # (item field, XPath relative to the cell), in column order from td[0].
        columns = (
            ('name', 'a/text()'),
            ('exponent', './/text()'),
            ('launch_time', './/text()'),
            ('location', 'span/text()'),
            ('deal', './/text()'),
            ('popularity', './/text()'),
            ('profit', './/text()'),
            ('dispersity', './/text()'),
            ('mobility', './/text()'),
            ('transparency', './/text()'),
        )

        collected = []
        for row in response.xpath('//div[@class="main_con1"]/table/tbody/tr'):
            cells = row.xpath('td')
            # Decimal fields can be transformed by django itself.
            item = PingjiItem()
            item['timestamp'] = timestamp
            for position, (field, path) in enumerate(columns):
                item[field] = get_content(cells[position].xpath(path).extract())

            if item.get_uk():
                collected.append(item)

        return collected
Example #6
0
    def parse(self, response):
        """Build a DiseaseDetailInfoItem from one 39health disease page.

        The last path segment of the URL selects what is extracted: ``jbzs``
        fills ``name`` and ``description``; any other segment fills the single
        field named by ``self.url_attr_db_map``.  Extraction is best effort:
        on failure a warning is logged and the item is returned with whatever
        was filled before the error.
        """
        attr_type = response.url.split('/')[-1]
        # NOTE(review): '%d' raises TypeError when the URL is absent from
        # self.mapping (``.get`` would then yield None) - confirm the mapping
        # always contains every crawled URL.
        symbol = (self.mapping.get(response.url), self.url_attr_map[attr_type],
                  response.url)
        self.logger.info('Parsing ID.%d 39health Disease %s Info From <%s>.' %
                         symbol)

        item = DiseaseDetailInfoItem()
        item['d_id'] = symbol[0]
        try:
            if attr_type == 'jbzs':
                item['name'] = get_content(
                    response.xpath('//h1/text()').extract())
                item['description'] = get_content(
                    response.xpath('//div[@class="chi-know"]').extract())
            else:
                item[self.url_attr_db_map[attr_type]] = get_content(
                    response.xpath('//div[@class="art-box"]').extract())
        except Exception:
            # Still best effort, but no longer silent and no longer catching
            # KeyboardInterrupt/SystemExit as the bare ``except`` did.
            self.logger.warning(
                'Failed to extract 39health Disease %s Info From <%s>.',
                symbol[1], symbol[2], exc_info=True)

        return item
Example #7
0
	def parse(self, response):
		"""Return one ToubiaoItem per ``li.clearfix`` entry of a tender list page.

		The loan type is the part after the last '_' in the class of the
		entry's ``div``.  Loan details are read from the ``leftpart`` block and
		the status from the ``rightpart`` block, each only when present.
		"""
		self.logger.info('Parsing Yinrendai Tender List Info From <%s>.' % response.url)

		def first(selector, path):
			# Shorthand for the extract-then-get_content pattern used below.
			return get_content(selector.xpath(path).extract())

		collected = []
		for tender in response.xpath('//li[@class="clearfix"]'):
			item = ToubiaoItem()
			item['loan_type'] = first(tender, 'div/@class').split('_')[-1]

			left = tender.xpath('div/div[@class="leftpart"]')
			if left:
				item['loan_url'] = first(left, './h3/a/@href')
				item['pin'] = self.get_pin_from_url(first(left, 'h3/a/@href'))
				item['loan_description'] = first(left, 'h3/a/text()')
				item['warrant_icon'] = first(left, 'h3/a/span/@class')

				item['progress'] = first(left, 'div[@class="l bidDetail"]/p/text()')
				item['volume'] = first(left, 'div[@class="l bid_total"]/h4/span/text()')
				item['interest_rate'] = first(left, 'div[@class="l bid_rate"]/h4/span/text()')
				item['term'] = first(left, 'div[@class="l bidInfor"]/h4/span/text()')

			right = tender.xpath('div/div[@class="rightpart"]')
			if right:
				# The class of the right-hand div is looked up in self.status_list.
				status_class = first(right, 'div/@class')
				item['status'] = self.status_list.get(status_class)

			collected.append(item)

		return collected
Example #8
0
    def parse_news_detail(self, response):
        """Yield a NewsItem built from an article page.

        Title, creation time and category come from page elements; author,
        summary and keywords come from the matching ``<meta>`` tags; the body
        is read from ``div.article-txt``.
        """

        def page_text(path):
            return get_content(response.xpath(path).extract())

        def meta_tag(tag_name):
            query = '//meta[@name="%s"]/@content' % tag_name
            return response.xpath(query).extract_first()

        news = NewsItem()
        news['thread'] = self.get_thread_from_url(response.url)
        news['source'] = response.url
        news['title'] = page_text('//title/text()')

        news['created'] = page_text('//small/span[last()]/text()')
        news['author'] = meta_tag('author')
        news['summary'] = meta_tag('description')
        news['keywords'] = meta_tag('keywords')
        news['category'] = page_text('//small/span[1]/a/text()')

        article = response.xpath('//div[@class="article-txt"]')
        news['raw_content'] = article.extract_first()

        text_nodes = article.xpath('.//text()').extract()
        news['content'] = ''.join([get_trunk(node) for node in text_nodes])

        # Image URLs are joined with '#'; an empty result is stored as None.
        image_sources = article.xpath('.//img/@src').extract()
        joined_images = '#'.join([get_trunk(src) for src in image_sources])
        news['image_url'] = joined_images or None

        yield news
Example #9
0
    def parse(self, response):
        """Return one PingjiItem per rating-table row that has a unique key.

        Rows come from the table under ``div.mod-tablelists``; the first
        ``<td>`` of each row is not read.  Rows whose ``get_uk()`` is falsy
        are dropped.
        """
        timestamp = self.timestamp
        self.logger.info('Parsing %s Wangjia Rating From <%s>.' % (timestamp, response.url))

        # (td index, item field, XPath relative to that cell).
        layout = (
            (1, 'name', 'div/a/text()'),
            (2, 'exponent', 'div/em/text()'),
            (3, 'launch_time', './/text()'),
            (4, 'location', './/text()'),
            (5, 'deal', './/text()'),
            (6, 'popularity', './/text()'),
            (7, 'lever', './/text()'),
            (8, 'dispersity', './/text()'),
            (9, 'mobility', './/text()'),
            (10, 'transparency', './/text()'),
        )

        kept = []
        rows = response.xpath('//div[@class="mod-tablelists"]/table/tbody/tr')
        for row in rows:
            cells = row.xpath('td')
            # Decimal fields can be transformed by django itself.
            item = PingjiItem()
            item['timestamp'] = timestamp
            for index, field, path in layout:
                item[field] = get_content(cells[index].xpath(path).extract())

            if not item.get_uk():
                continue
            kept.append(item)

        return kept
Example #10
0
    def parse_site_info(self, response):
        """Build a SiteInfoItem from the ``#site-plate`` section of a page.

        Rows of the ``table.table`` table are matched by label fragment and
        copied into the corresponding item field; rows with an empty label or
        value are ignored.  All rows of the ``small-table`` table are stored
        as a JSON object in ``certification``.
        """
        company = SiteInfoItem()
        company['link'] = response.url
        company['code'] = response.meta['code']
        site_info = response.xpath('//div[@id="site-plate"]')

        # (label fragment, item field); checked in this order, first match wins.
        label_fields = (
            (u'平台地址', 'website'),
            (u'平台简称', 'short_name'),
            (u'上线运营时间', 'online_time'),
            (u'许可', 'license'),
            (u'应用', 'app'),
            (u'微信', 'wechat'),
        )
        for tr in site_info.xpath('table[@class="table"]//tr'):
            key = get_content(tr.xpath('string(td[1])').extract())
            value = get_content(tr.xpath('string(td[2])').extract())
            if not key or not value:
                continue
            for fragment, field in label_fields:
                if fragment in key:
                    company[field] = value
                    break

        certification = {}
        for tr in site_info.xpath('table[@class="small-table"]/tbody/tr'):
            key = get_content(tr.xpath('string(td[1])').extract())
            certification[key] = get_content(
                tr.xpath('string(td[2])').extract())

        # No ``encoding`` keyword: Python 3's json.dumps rejects it with a
        # TypeError, and on Python 2 UTF-8 is already the default.
        company['certification'] = json.dumps(certification,
                                              ensure_ascii=False)

        return company
Example #11
0
    def parse_flow_index(self, response, thread, name, link):
        """Yield FlowItems for the ``flow-monitoring`` section of a page.

        One item is yielded per ``<dl>`` in the section body, carrying the
        institution name from its ``<dt>`` and the last element of the JSON
        series embedded in its ``<script>``.  A final item labelled
        '综合指数' carries the value from the section header.  Every item is
        a deep copy of a template holding ``thread``, ``name`` and ``link``
        and is dated ``self.flow_date``.
        """
        platform_flow = FlowItem()
        platform_flow['thread'] = thread
        platform_flow['name'] = name
        platform_flow['link'] = link

        # Parse and re-format so a malformed self.flow_date fails here.
        flow_date = datetime.strptime(self.flow_date, '%Y-%m-%d')
        date_text = flow_date.strftime('%Y-%m-%d')

        flow_monitoring = response.xpath('//div[@class="flow-monitoring"]')
        for index_info in flow_monitoring.xpath('./div[@class="bd"]/dl'):
            # The series is the text between '=' and the first ';' of the script.
            raw_series = index_info.xpath('./script/text()').re_first(r'=(.*?);')
            if raw_series is None:
                # No embedded series: skip this institution rather than abort
                # the generator and lose the remaining items.
                continue
            index_list = list(json.loads(raw_series.strip()))
            if not index_list:
                continue

            platform_flow_item = copy.deepcopy(platform_flow)
            platform_flow_item['institution'] = get_content(index_info.xpath('./dt/text()').extract())
            platform_flow_item['date'] = date_text
            # Only the last element of the series is emitted.
            platform_flow_item['flow'] = index_list[-1]
            yield platform_flow_item

        platform_flow_item = copy.deepcopy(platform_flow)
        platform_flow_item['institution'] = '综合指数'
        platform_flow_item['date'] = date_text
        platform_flow_item['flow'] = get_content(flow_monitoring.xpath('./div[@class="hd"]/strong/text()').extract())
        yield platform_flow_item
Example #12
0
    def parse(self, response):
        """Return one WentiItem per platform listed under each province block.

        Province blocks are the ``div`` children of ``#issuePlatList`` whose
        class starts with ``rnav``.  Each platform's ``pin`` is the last
        non-empty '/'-separated segment of its ``purl`` attribute.
        """
        collected = []
        province_blocks = response.xpath(
            '//div[@id="issuePlatList"]/div[starts-with(@class, "rnav")]')
        for block in province_blocks:
            title_text = block.xpath(
                'div[@class="til"]/div/p[not(@class="til_num")]/text()').extract()
            province_id = ProvinceItem.get_id_by_name(get_content(title_text))

            for platform in block.xpath('ul[@class="til_cn"]/li'):
                item = WentiItem()
                item['name'] = get_content(platform.xpath('a/text()').extract())

                segments = get_content(
                    platform.xpath('a/@purl').extract()).split('/')
                # Drop the empty segments left by trailing slashes.
                while not segments[-1]:
                    segments.pop()
                item['pin'] = segments.pop()

                item['province_id'] = province_id
                icon_class = get_content(platform.xpath('i/@class').extract())
                item['event_category'] = \
                    self.get_event_category_by_classname(icon_class)

                collected.append(item)

        return collected
Example #13
0
    def parse_exposure_detail(self, response):
        """Yield an ExposureItem built from a forum exposure thread page.

        ``created`` is the text after the first space of the second
        ``div.authi`` element's ``<em>``; ``name``, ``link`` and ``reason``
        come from the first three rows of the ``typeoption`` table; the body
        is the ``<td>`` whose id contains ``postmessage``.
        """

        def page_text(path):
            return get_content(response.xpath(path).extract())

        exposure = ExposureItem()
        exposure['thread'] = self.get_thread_from_url(response.url)
        exposure['source'] = response.url
        exposure['title'] = page_text('//span[@id="thread_subject"]/text()')

        posted_on = response.xpath('(//div[@class="authi"])[2]/em/text()').extract_first()
        start = posted_on.index(' ') + 1
        exposure['created'] = posted_on[start:]

        exposure['name'] = page_text('//div[@class="typeoption"]//tr[1]/td/text()')
        exposure['link'] = page_text('//div[@class="typeoption"]//tr[2]/td/a/text()')
        exposure['reason'] = page_text('//div[@class="typeoption"]//tr[3]/td/text()')

        body = response.xpath('//td[contains(@id, "postmessage")]')
        text_nodes = body.xpath('.//text()').extract()
        exposure['content'] = ''.join([get_trunk(node) for node in text_nodes])
        exposure['raw_content'] = body.extract_first()

        # Only images whose ``zoomfile`` starts with "data" are collected;
        # they are joined with '#', and an empty result is stored as None.
        zoom_files = response.xpath(
            '//ignore_js_op//img[re:test(@zoomfile, "^data")]/@zoomfile').extract()
        joined_images = '#'.join(
            [self.modify_image_url(get_trunk(src)) for src in zoom_files])
        exposure['image_url'] = joined_images or None

        yield exposure
Example #14
0
    def parse(self, response):
        """Return one BiaorenItem per data row of the ``bidRecord`` table.

        The ToubiaoItem found for this URL's pin is stored on ``self.object``
        and its ``pin`` is copied to every bid.  Rows without ``<td>`` cells
        are skipped.
        """
        pk = self.mapping.get(self.get_pin_from_url(response.url))
        self.logger.info('Parsing ID.%d Yinrendai Bidder List Info From <%s>' %
                         (pk, response.url))
        self.object = ToubiaoItem.get_object_by_pk(pk)

        # Item fields in the order of the first three cells of a row.
        cell_fields = ('bid_nikename', 'bid_amount', 'bid_time')

        bids = []
        for row in response.xpath('//table[@class="bidRecord"]//tr'):
            cells = row.xpath('.//td')
            if not cells:
                continue

            item = BiaorenItem()
            item['pin'] = self.object.pin
            for position, field in enumerate(cell_fields):
                item[field] = get_content(
                    cells[position].xpath('text()').extract())

            bids.append(item)

        return bids
Example #15
0
    def parse(self, response):
        """Return one DiseaseItem per ``div.res_list`` entry of a listing page.

        ``name`` and ``link`` are always set.  ``alias`` and
        ``relevant_symptoms`` (the non-empty symptom names joined by a
        space) are best effort: if extracting them fails, a warning is logged
        and the item is still returned.
        """
        self.logger.info('Parsing 39 Disease URLs From <%s>.' % response.url)

        item_list = []
        for ele in response.xpath('//div[@class="res_list"]'):
            item = DiseaseItem()
            item['name'] = get_content(
                ele.xpath('dl/dt/h3/a/text()').extract())
            item['link'] = get_content(ele.xpath('dl/dt/h3/a/@href').extract())
            try:
                item['alias'] = get_content(
                    ele.xpath('dl/dt/cite/text()').extract())
                symptom_names = [
                    get_content(s.xpath('text()').extract())
                    for s in ele.xpath('div/p/a')
                ]
                item['relevant_symptoms'] = ' '.join(
                    [rs for rs in symptom_names if rs])
            except Exception:
                # Still best effort, but no longer silent and no longer
                # catching KeyboardInterrupt/SystemExit as the bare
                # ``except`` did.
                self.logger.warning(
                    'Failed to extract alias/symptoms From <%s>.',
                    response.url, exc_info=True)
            item_list.append(item)

        return item_list
Example #16
0
    def parse(self, response):
        """Parse one Wangjia news page (``div.show-box`` layout) into a XinwenItem.

        Returns ``None`` after logging a warning when the thread/category
        cannot be derived from the URL, or when the page contains the
        ``alert_info`` message box.  Otherwise the populated item is logged
        and returned.
        """
        tc = self.get_thread_category_from_url(response.url)
        if not tc[0] or not tc[1]:
            self.logger.warning('Invalid Wangjia News Item From <%s>.' % response.url)
            return None

        symbol = (tc[0], self.tab[tc[1]], response.url)
        if response.xpath('//div[@id="messagetext" and @class="alert_info"]'):
            self.logger.warning('No.%s Wangjia News %s Item From <%s> Maybe Limited.' % symbol)
            return None

        self.logger.info('Parsing No.%s Wangjia News %s Item From <%s>.' % symbol)

        item = XinwenItem()
        item['thread'] = int(symbol[0])
        item['category_id'] = tc[1]
        item['source'] = symbol[2]

        article = response.xpath('//div[@class="show-box"]')
        item['title'] = get_content(article.xpath('h1/text()').extract())

        subtitle = article.xpath('div[@class="s-bq"]/span')
        item['created'] = subtitle[0].xpath('text()').extract()[0]
        # The author is only read when a third <span> exists; it is the text
        # after the first ':' of that span.
        if len(subtitle) >= 3:
            item['author'] = get_content(subtitle[2].xpath('text()').extract()).split(u':')[1]
        item['summary'] = get_content(article.xpath('div[@class="s-zy"]/span/text()').extract())

        body = article.xpath('div[@class="c-cen"]')
        item['content'] = ''.join([get_trunk(c) for c in body.xpath('.//text()').extract()])
        item['raw_content'] = get_content(body.extract())
        # Image URLs are joined with '#'; an empty result is stored as None.
        item['image_url'] = '#'.join([self.modify_image_url(get_trunk(c)) for c in body.xpath('.//img/@src').extract()]) or None

        self.logger.info(item)
        # Return the item: returning None here would discard everything
        # parsed above.
        return item
Example #17
0
    def parse_detail(self, response):
        """Yield a MemberItem with the contact details found in ``#tytext``.

        Three page layouts are handled:

        * paragraphs nested in a ``div``: each ``./div/p`` is one contact line;
        * fewer than four direct ``<p>`` children: the second ``<p>`` holds
          all contact lines separated by newlines;
        * otherwise each direct ``<p>`` is one contact line.

        A line is assigned to ``website``, ``phone``, ``address`` or ``zip``
        according to the first label it contains; the stored value is the
        text after the last ':'.  A website value starting with '/' gets
        'http:' prefixed.
        """
        member = MemberItem()

        info = response.xpath('//div[@id="tytext"]')
        member['name'] = get_content(info.xpath('h1/text()').extract())
        member['date'] = get_content(
            info.xpath('p[@class="tytdate"]/text()').extract())
        member['link'] = response.url

        # (label, item field); checked in this order, first match wins.
        label_fields = (
            (u'网址', 'website'),
            (u'电话', 'phone'),
            (u'地址', 'address'),
            (u'邮编', 'zip'),
        )

        def store(line, value):
            """Save ``value`` under the field of the first label in ``line``."""
            for label, field in label_fields:
                if label in line:
                    member[field] = value
                    break

        nested_paragraphs = info.xpath('./div/p')
        direct_paragraphs = info.xpath('./p')
        if nested_paragraphs or len(direct_paragraphs) >= 4:
            # One contact line per paragraph.
            for p in nested_paragraphs or direct_paragraphs:
                content = get_content(p.xpath('string(.)').extract())
                if content is None:
                    continue
                store(content, content.split(':')[-1])
        else:
            # All contact lines live in the second paragraph.
            lines = info.xpath('string(./p[2])').extract_first().split('\n')
            for line in lines:
                store(line, get_trunk(line.split(u':')[-1]))

        # Splitting "label:http://host" on ':' leaves "//host", so the scheme
        # is restored here.  ``get`` plus the truthiness check covers pages
        # where no website line was found or its value is empty.
        website = member.get('website')
        if website and website[0] == '/':
            member['website'] = 'http:' + website
        yield member
Example #18
0
 def parse_list(self, response):
     """Yield a detail-page request for every member link on a list page.

     Links are the anchors inside the ``memberab_lsit`` / ``memtab_list``
     blocks; each href is appended to the fixed old.iachina.cn base URL
     and handled by ``parse_detail``.  Anchors without an href are skipped.
     """
     for member in response.xpath(
             '//div[@class="memberab_lsit" or @class="memtab_list"]/ul/li/a'
     ):
         href = get_content(member.xpath('@href').extract())
         if not href:
             # Nothing to follow; concatenating None would raise TypeError.
             continue
         yield scrapy.Request(url='http://old.iachina.cn/' + href,
                              callback=self.parse_detail,
                              dont_filter=True)
Example #19
0
 def parse_report_list(self, response):
     """Yield a detail-page request for every report link on a list page.

     Links are the anchors under ``ul.reportList``; each href is appended
     to the fixed dailuopan.com base URL and handled by ``parse_detail``.
     The ``category`` from this response's meta is forwarded to the detail
     request.  Anchors without an href are skipped.
     """
     for report in response.xpath('//ul[@class="reportList"]/li/a'):
         href = get_content(report.xpath('./@href').extract())
         if not href:
             # Nothing to follow; concatenating None would raise TypeError.
             continue
         yield scrapy.Request(url='http://www.dailuopan.com' + href,
                              meta={'category': response.meta['category']},
                              callback=self.parse_detail,
                              dont_filter=True)
Example #20
0
    def parse_govern_info(self, response):
        """Build a GovernInfoItem from the ``#govern-info`` section of a page.

        The company name is the string value of the first ``<span>`` in the
        intro text.  ``relation`` (table 2) is stored as a JSON object,
        ``controller`` (first row of table 3) as a JSON list of text nodes,
        and ``shareholder_list`` / ``manager_list`` (tables 4 and 5) as JSON
        lists holding, per row, the cell texts for which ``get_trunk`` does
        not return an empty string.

        Raises:
            IndexError: if the intro ``<span>`` is missing from the page.
        """
        name = get_content(
            response.xpath('//div[@class="comp-intro"]').xpath(
                './/div[@class="intro-txt"]/span')[0].xpath(
                    'string(.)').extract())

        company = GovernInfoItem()
        company['link'] = response.url
        company['name'] = name
        company['code'] = response.meta['code']

        govern_info = response.xpath('//div[@id="govern-info"]')

        def dump(value):
            """Serialize ``value`` as JSON, keeping non-ASCII text readable."""
            # No ``encoding`` keyword: Python 3's json.dumps rejects it with
            # a TypeError, and on Python 2 UTF-8 is already the default.
            return json.dumps(value, ensure_ascii=False)

        def table_rows(path):
            """Return the kept cell texts of every row matched by ``path``."""
            rows = []
            for tr in govern_info.xpath(path):
                cells = [
                    get_trunk(text)
                    for text in tr.xpath('td//text()').extract()
                ]
                rows.append([cell for cell in cells if cell != ''])
            return rows

        company['structure'] = get_content(
            response.xpath(
                '//div[@class="mask"]/img[@class="mask-img"]/@src').extract())

        relation = {}
        for tr in govern_info.xpath('table[2]/tbody/tr'):
            key = get_content(tr.xpath('string(td[1])').extract())
            relation[key] = get_content(tr.xpath('string(td[2])').extract())
        company['relation'] = dump(relation)

        company['controller'] = dump(
            govern_info.xpath('table[3]/tbody/tr[1]/td/text()').extract())

        company['shareholder_list'] = dump(table_rows('table[4]/tbody/tr'))
        company['manager_list'] = dump(table_rows('table[5]/tbody/tr'))

        return company
Example #21
0
    def parse(self, response):
        """Return one WentiItem per problem-platform row that has a unique key.

        Rows come from the table under ``div.wtpt``; the first ``<td>`` of
        each row is not read.  ``province_id`` is removed again when the
        province lookup yields ``None``.  Rows whose ``get_uk()`` is falsy
        are dropped.
        """
        self.logger.info('Parsing Wangjia Problem Platform From <%s>.' % response.url)

        def plain(cell):
            return get_content(cell.xpath('text()').extract())

        def dashless(cell):
            # NOTE(review): ``exclude`` receives the plain string '-', which
            # is what the original ``('-')`` evaluates to - confirm
            # get_content does not expect a one-element tuple here.
            return get_content(cell.xpath('text()').extract(), exclude='-')

        kept = []
        for row in response.xpath('//div[@class="wtpt"]/div/table/tbody/tr'):
            cells = row.xpath('td')

            item = WentiItem()
            item['name'] = get_content(cells[1].xpath('.//text()').extract())
            item['problem_time'] = dashless(cells[2])
            item['launch_time'] = dashless(cells[3])
            item['registered_capital'] = dashless(cells[4])

            item['province_id'] = ProvinceItem.get_id_by_name(plain(cells[5]))
            if item['province_id'] is None:
                item.pop('province_id')

            item['accounted_revenue'] = dashless(cells[6])
            item['involved_passenger'] = dashless(cells[7])
            item['event_category'] = dashless(cells[8])

            if item.get_uk():
                kept.append(item)

        return kept
Example #22
0
 def parse_company_list(self, response):
     """Yield a MemberItem for each table row naming a known member.

     The name is the string value of the second ``<td>``; rows for which
     ``MemberItem.get_member(name=...)`` is falsy are skipped.  Position,
     representative and type come from the third to fifth ``<td>``.
     """
     # (item field, XPath of the cell holding it), in assignment order.
     extra_columns = (
         ('position', 'string(./td[3])'),
         ('represent', 'string(./td[4])'),
         ('type', 'string(./td[5])'),
     )

     rows = response.xpath('//*[@id="hysjbox"]/div[2]/table/tbody/tr')
     for row in rows:
         name = get_content(row.xpath('string(./td[2])').extract())
         if not MemberItem.get_member(name=name):
             continue

         member = MemberItem()
         member['name'] = name
         for field, path in extra_columns:
             member[field] = get_content(row.xpath(path).extract())
         yield member
Example #23
0
    def parse(self, response):
        """Collect WentiItems from the problem-platform table under ``div.wtpt``.

        Each row's cells 1-8 feed the item fields (cell 0 is not read).  A
        ``None`` province id is removed from the item.  Only items with a
        truthy ``get_uk()`` are returned.
        """
        self.logger.info('Parsing Wangjia Problem Platform From <%s>.' %
                         response.url)

        # (td index, item field) for the cells read with exclude='-',
        # split around the province column so assignment order is kept.
        before_province = (
            (2, 'problem_time'),
            (3, 'launch_time'),
            (4, 'registered_capital'),
        )
        after_province = (
            (6, 'accounted_revenue'),
            (7, 'involved_passenger'),
            (8, 'event_category'),
        )

        def fill(item, cells, columns):
            # NOTE(review): ``exclude`` receives the plain string '-', which
            # is what the original ``('-')`` evaluates to - confirm
            # get_content does not expect a one-element tuple here.
            for index, field in columns:
                item[field] = get_content(
                    cells[index].xpath('text()').extract(), exclude='-')

        platform_list = []
        for row in response.xpath('//div[@class="wtpt"]/div/table/tbody/tr'):
            cells = row.xpath('td')

            item = WentiItem()
            item['name'] = get_content(cells[1].xpath('.//text()').extract())
            fill(item, cells, before_province)

            province_name = get_content(cells[5].xpath('text()').extract())
            item['province_id'] = ProvinceItem.get_id_by_name(province_name)
            if item['province_id'] is None:
                item.pop('province_id')

            fill(item, cells, after_province)

            if not item.get_uk():
                continue
            platform_list.append(item)

        return platform_list
Example #24
0
    def parse(self, response):
        """Build a XinwenItem from the ``div.art_left`` block of a news page.

        The NewsListItem looked up for this URL is stored on ``self.object``.
        When the page has no ``art_left`` block an empty item is returned.
        """
        pk = self.mapping.get(response.url)
        self.logger.info(
            "Parsing ID.%d 39Health News Disease Ditail From <%s>" %
            (pk, response.url))
        self.object = NewsListItem.get_object_by_pk(pk)

        item = XinwenItem()
        left = response.xpath('//div[@class="art_left"]')
        if not left:
            return item

        item['title'] = get_content(left.xpath('div/h1/text()').extract())

        # The <em> elements of the date line: [0] time, [1] source, [2] author.
        ems = left.xpath('div/div[@class="art_info"]').xpath(
            'div[@class="date"]//em')
        item['time'] = get_content(ems[0].xpath('text()').extract())

        # The source is either a link (href and text) or plain text.
        source_link = ems[1].xpath('a')
        if source_link:
            item['source_website_link'] = get_content(
                source_link.xpath('@href').extract())
            item['source_website'] = get_content(
                source_link.xpath('text()').extract())
        else:
            item['source_website'] = get_content(
                ems[1].xpath('text()').extract())

        item['source_author'] = get_content(
            ems[2].xpath('text()').extract(), skipBlank=False)

        item['summary'] = get_content(
            left.xpath('div/p[@class="summary"]/text()').extract())
        item['content'] = get_content(
            left.xpath('div/div[@class="art_con"]').extract())
        return item
Example #25
0
	def parse(self, response):
		"""Build a XinwenItem from the ``div.art_left`` block of a news page.

		The NewsListItem looked up for this URL is stored on ``self.object``.
		When the page has no ``art_left`` block an empty item is returned.
		"""
		pk = self.mapping.get(response.url)
		self.logger.info("Parsing ID.%d 39Health News Disease Ditail From <%s>" % (pk, response.url))
		self.object = NewsListItem.get_object_by_pk(pk)

		def first(selector, path, **options):
			# Shorthand for the extract-then-get_content pattern used below.
			return get_content(selector.xpath(path).extract(), **options)

		item = XinwenItem()
		left = response.xpath('//div[@class="art_left"]')
		if left:
			item['title'] = first(left, 'div/h1/text()')

			info = left.xpath('div/div[@class="art_info"]')
			# The <em> elements of the date line: [0] time, [1] source, [2] author.
			ems = info.xpath('div[@class="date"]//em')
			item['time'] = first(ems[0], 'text()')

			# The source is either a link (href and text) or plain text.
			source_link = ems[1].xpath('a')
			if source_link:
				item['source_website_link'] = first(source_link, '@href')
				item['source_website'] = first(source_link, 'text()')
			else:
				item['source_website'] = first(ems[1], 'text()')

			item['source_author'] = first(ems[2], 'text()', skipBlank=False)

			item['summary'] = first(left, 'div/p[@class="summary"]/text()')
			item['content'] = first(left, 'div/div[@class="art_con"]')
		return item
Example #26
0
    def parse_news_detail(self, response):
        """Yield a NewsItem assembled from a news detail page.

        ``created``, ``author``, ``category`` and ``summary`` are copied from
        ``response.meta``; thread, source, title, content and image URLs come
        from the response itself.
        """
        news = NewsItem()
        news['thread'] = self.get_thread_from_url(response.url)
        news['source'] = response.url
        news['title'] = get_content(response.xpath('//h1/text()').extract())

        # Values handed over by the request that scheduled this callback.
        for field in ('created', 'author', 'category', 'summary'):
            news[field] = response.meta[field]

        article = response.xpath('//td[@id="article_content"]')
        news['raw_content'] = article.extract_first()

        paragraphs = article.xpath(
            './/p[contains(@class, "ke-editor-inner-p")]/text()').extract()
        news['content'] = ''.join([get_trunk(text) for text in paragraphs])

        image_urls = [
            self.modify_image_url(get_trunk(src))
            for src in article.xpath('.//img/@src').extract()
        ]
        # '#'-joined URL list; None when the joined string is empty.
        news['image_url'] = '#'.join(image_urls) or None

        yield news
Example #27
0
    def parse(self, response):
        """Collect a FeatureItem (pin, name, link) for every listed platform.

        The pin is the last non-empty '/'-separated segment of the link with
        everything from the first '.' onwards removed. Entries whose link has
        no usable segment, or whose pin is 'www', 'statistics' or empty, are
        skipped.

        Returns:
            list of FeatureItem.
        """
        item_list = []
        plats = response.xpath('//div[@class="c_module2 clear"]/div[@class="main"]/div[@class="warp"]/div[@class="c_modreg"]/ul/li')
        for plat in plats:
            url = get_content(plat.xpath('a/@href').extract())

            segments = (url or '').split('/')
            while segments and not segments[-1]:
                segments.pop()
            if not segments:
                # No path segment to derive a pin from. Reading the unset
                # 'pin' field below would otherwise raise KeyError.
                continue

            pin = segments.pop().split('.')[0]
            if pin in ('www', 'statistics', ''):
                continue

            item = FeatureItem()
            item['pin'] = pin
            item['name'] = get_content(plat.xpath('a/text()').extract())
            item['link'] = url
            item_list.append(item)

        return item_list
Example #28
0
 def parse_platform_list(self, response):
     """Yield one detail request per table row that carries a link.

     The platform name (second cell) and the absolute link are forwarded
     to ``parse_detail`` through ``meta``.
     """
     for info in response.xpath('//tbody/tr'):
         path = info.xpath('./td[last()]/a/@href').extract_first()
         if not path:
             # extract_first() yields None when the last cell has no link;
             # concatenating that would raise TypeError and abort the
             # remaining rows.
             continue

         href = 'http://www.dailuopan.com' + path
         name = get_content(info.xpath('./td[2]/a[1]/text()').extract())
         yield scrapy.Request(url=href,
                              callback=self.parse_detail,
                              meta={'name': name, 'link': href},
                              dont_filter=True)
Example #29
0
    def parse(self, response):
        """Record the prefixed ``pname`` link of every rating-table row.

        Returns the ExporterItem that accumulated the records.
        """
        self.logger.info('Parsing Wangjia Rating Item URLs From <%s>.' % response.url)

        item = ExporterItem()
        for row in response.xpath('//table[@id="rateTable_body"]/tbody/tr'):
            href = get_content(row.xpath('td/a[@class="pname"]/@href').extract())
            item.set_record(self.url_prefix + href)

        return item
Example #30
0
    def parse(self, response):
        """Collect NewsListItem entries from the list boxes of a page.

        Each ``<li>`` contributes link, title and time; entries whose time is
        rejected by ``self.judgeYear`` are left out.
        """
        self.logger.info('Parsing 39Health News Disease URLs From <%s>.' % response.url)

        item_list = []
        for ul in response.xpath('//div[@class="listbox"]//ul'):
            for entry in ul.xpath('li'):
                item = NewsListItem()
                item['category_id'] = self.category_id
                item['link'] = get_content(entry.xpath('span/a/@href').extract())

                # First <span> holds the title link, second the time text.
                spans = entry.xpath('span')
                item['title'] = get_content(spans[0].xpath('a/text()').extract())
                item['time'] = get_content(spans[1].xpath('text()').extract(), skipBlank=False)

                if self.judgeYear(item['time']):
                    item_list.append(item)
        return item_list
Example #31
0
    def parse_cooperation_product_list(self, response):
        """Store a product list on the cooperation item carried in meta.

        Called twice per cooperation: with ``meta['type'] == 'cur'`` it saves
        the list as ``cur_product_list`` and requests the history list with
        this same callback; for any other type it saves ``his_product_list``
        and yields the finished cooperation item.
        """
        cooperation = response.meta['cooperation']

        product_list = [
            {
                'actual_name': get_content(entry.xpath('string(p[1])').extract()),
                'record_name': get_content(entry.xpath('string(p[2])').extract())
            }
            for entry in response.xpath('//div[@class="xz_nei_lxf"]/ul/li')
        ]

        if response.meta['type'] != 'cur':
            cooperation['his_product_list'] = json.dumps(product_list,
                                                         encoding="UTF-8",
                                                         ensure_ascii=False)
            yield cooperation
            return

        cooperation['cur_product_list'] = json.dumps(product_list,
                                                     encoding="UTF-8",
                                                     ensure_ascii=False)
        # History
        company = cooperation['company']
        form = {
            'columnid': company.column_id,
            'internetInformationNo': company.info_no,
            'zj': company.zj,
            'terraceNo': cooperation['terrace_no'],
            'oldTerraceNo': cooperation['old_terrace_no'],
            'type': cooperation['flag'],
            'comType': cooperation['type']
        }
        yield scrapy.FormRequest(
            url='http://icid.iachina.cn/ICID/front/viewAllProHis.do',
            method='POST',
            formdata=form,
            meta={
                'cooperation': cooperation,
                'type': 'his'
            },
            callback=self.parse_cooperation_product_list,
            dont_filter=True)
Example #32
0
    def parse_exposure_list(self, response):
        """Yield a detail request for every list entry that has a link.

        The entry title is forwarded to ``parse_exposure_detail`` via
        ``meta['title']``.
        """
        for exposure_abs in response.xpath('//div[contains(@class, "item")]'):
            href = exposure_abs.xpath('.//div[@class="forum-main left"]/a/@href').extract_first()
            if not href:
                # The class test matches any div whose class contains "item";
                # those without the forum link give None here, which would
                # raise TypeError when building the URL below.
                continue

            title = get_content(exposure_abs.xpath('.//div[@class="forum-main left"]/a/div/text()').extract_first())

            yield scrapy.Request(url="http://www.p2peye.com" + href,
                                 callback=self.parse_exposure_detail,
                                 meta={'title': title},
                                 dont_filter=True)
Example #33
0
    def parse_cooperation_detail(self, response):
        """Copy labelled detail rows onto the cooperation item, then request
        its current product list.

        Each ``<p>`` row is matched against the label keywords below; the
        first keyword contained in the row's label decides the target field.
        """
        cooperation = response.meta['cooperation']
        company = cooperation['company']

        # Ordered (label keyword, item field) pairs; order matters because
        # only the first matching keyword is applied.
        keyword_fields = (
            (u'全称', 'full_name'),
            (u'简称', 'short_name'),
            (u'地址', 'website'),
            (u'备案', 'records'),
            (u'范围', 'scope'),
            (u'起始', 'start_date'),
            (u'终止', 'end_date'),
        )

        for entry in response.xpath('//div[@class="ppp"]/p'):
            key = get_content(entry.xpath('span/text()').extract())
            raw_value = get_content(entry.xpath('text()').extract())
            value = raw_value.replace(u':', '')

            for keyword, field in keyword_fields:
                if keyword in key:
                    cooperation[field] = value
                    break

        form = {
            'columnid': company.column_id,
            'internetInformationNo': company.info_no,
            'zj': company.zj,
            'terraceNo': cooperation['terrace_no'],
            'oldTerraceNo': cooperation['old_terrace_no'],
            'type': cooperation['flag'],
            'comType': cooperation['type']
        }
        yield scrapy.FormRequest(
            url='http://icid.iachina.cn/ICID/front/viewAllPro.do',
            method='POST',
            formdata=form,
            meta={
                'cooperation': cooperation,
                'type': 'cur'
            },
            callback=self.parse_cooperation_product_list,
            dont_filter=True)
Example #34
0
    def parse(self, response):
        """Collect a WentiItem for every platform listed per province block.

        The pin is the last non-empty '/'-separated segment of the anchor's
        ``purl`` attribute. Platforms whose ``purl`` has no non-empty segment
        are skipped.

        Returns:
            list of WentiItem.
        """
        item_list = []
        content = response.xpath('//div[@id="issuePlatList"]/div[starts-with(@class, "rnav")]')
        for sel_ct in content:
            province_name = get_content(sel_ct.xpath('div[@class="til"]/div/p[not(@class="til_num")]/text()').extract())
            province_id = ProvinceItem.get_id_by_name(province_name)

            for sel_pt in sel_ct.xpath('ul[@class="til_cn"]/li'):
                purl = (get_content(sel_pt.xpath('a/@purl').extract()) or '').split('/')
                # Guard the trim loop: with an empty or all-slash purl the
                # list runs out and purl[-1] would raise IndexError.
                while purl and not purl[-1]:
                    purl.pop()
                if not purl:
                    continue

                item = WentiItem()
                item['name'] = get_content(sel_pt.xpath('a/text()').extract())
                item['pin'] = purl.pop()
                item['province_id'] = province_id
                item['event_category'] = self.get_event_category_by_classname(get_content(sel_pt.xpath('i/@class').extract()))

                item_list.append(item)

        return item_list
Example #35
0
 def parse_list(self, response):
     """Yield a detail request for each row of the report list table.

     Title, numeric id and creation text are passed to ``parse_detail``
     through ``meta``.
     """
     anchor = './/td[@class="hui14"]//a'
     rows = response.xpath(
         '//table[contains(@id, "ListC_Info_LstC_Info")]/tr')
     for row in rows:
         title = get_content(row.xpath(anchor + '/text()').extract())
         report_id = get_content(row.xpath(anchor + '/@id').re(r'\d+'))
         href = get_content(row.xpath(anchor + '/@href').extract())
         link = 'http://www.circ.gov.cn' + href

         last_cell = get_content(
             row.xpath('.//td[@class="hui14"]/../td[last()]/text()'
                       ).extract())
         # Drop the first and last character of the cell text
         # (presumably surrounding brackets -- TODO confirm).
         created = last_cell[1:-1]

         request_meta = {
             'title': title,
             'id': report_id,
             'created': created
         }
         yield scrapy.Request(url=link,
                              callback=self.parse_detail,
                              meta=request_meta,
                              dont_filter=True)
    def parse(self, response):
        """Fill a DiseaseDetailInfoItem from one disease attribute page.

        The attribute type is the last '/'-separated part of the URL. For
        'jbzs' the name and description are read; for any other type the page
        body is stored under the field named by ``self.url_attr_db_map``.
        Extraction is best effort: a failure is logged and the item is
        returned with whatever was set so far.
        """
        attr_type = response.url.split('/')[-1]
        symbol = (self.mapping.get(response.url), self.url_attr_map[attr_type], response.url)
        self.logger.info('Parsing ID.%d 39health Disease %s Info From <%s>.' % symbol)

        item = DiseaseDetailInfoItem()
        item['d_id'] = symbol[0]
        try:
            if attr_type == 'jbzs':
                item['name'] = get_content(response.xpath('//h1/text()').extract())
                item['description'] = get_content(response.xpath('//div[@class="chi-know"]').extract())
            else:
                item[self.url_attr_db_map[attr_type]] = get_content(response.xpath('//div[@class="art-box"]').extract())
        except Exception as exc:
            # Still best effort, but no longer swallows SystemExit or
            # KeyboardInterrupt and leaves a trace of what went wrong.
            self.logger.warning('Failed To Extract 39health Disease %s Info From <%s>: %r',
                                symbol[1], symbol[2], exc)

        return item
Example #37
0
    def parse_detail(self, response):
        """Yield a ProductItem built from a product detail table.

        ``code`` comes from ``response.meta``; the PDF link is derived from
        it. Each table row's first cell is matched against the labels below
        and the second cell is stored in the field of the first label that
        the cell text contains.
        """
        product = ProductItem()
        code = response.meta['code']
        product['code'] = code
        product['pdf'] = 'http://www.iachina.cn/IC/tkk/03/' + code + '_TERMS.PDF'
        product['link'] = response.url

        # Ordered (row label, item field) pairs; the first label contained in
        # the row's key wins.
        label_fields = (
            (u'公司名称', 'company_name'),
            (u'产品名称', 'product_name'),
            (u'产品类别', 'product_type'),
            (u'设计类型', 'design_type'),
            (u'产品特殊属性', 'feature'),
            (u'承保方式', 'insured'),
            (u'保险期间类型', 'period_type'),
            (u'产品交费方式', 'pay_type'),
            (u'条款文字编码', 'clause'),
            (u'销售状态', 'state'),
            (u'停止销售日期', 'end_date'),
        )

        for entry in response.xpath('//table[@class="biaoge"]/tr'):
            key = get_content(entry.xpath('string(td[1])').extract())
            value = get_content(entry.xpath('string(td[2])').extract())
            for label, field in label_fields:
                if label in key:
                    product[field] = value
                    break

        yield product
Example #38
0
    def parse_product_list(self, response):
        """Store a product list on the company item carried in meta.

        With ``meta['type'] == 'cur'`` the list is saved as
        ``cur_product_list`` and the historical list is requested with this
        same callback. Otherwise it is saved as ``his_product_list``, the
        company type is derived from the last character of ``column_id`` and
        the company item is yielded.
        """
        company = response.meta['company']

        product_list = [
            {
                'actual_name': get_content(entry.xpath('string(p[1])').extract()),
                'record_name': get_content(entry.xpath('string(p[2])').extract()),
                'record_no': get_content(entry.xpath('string(p[3])').extract())
            }
            for entry in response.xpath('//div[@class="ge"]/ul/li')
        ]

        if response.meta['type'] != 'cur':
            company['his_product_list'] = json.dumps(product_list,
                                                     encoding="UTF-8",
                                                     ensure_ascii=False)
            # The trailing digit of column_id (1-based) picks the type name.
            type_names = ['人身险', '财产险', '中介类']
            company['type'] = type_names[int(company['column_id'][-1]) - 1]
            yield company
            return

        company['cur_product_list'] = json.dumps(product_list,
                                                 encoding="UTF-8",
                                                 ensure_ascii=False)
        # Historical insurance product information
        form = {
            'columnid': company['column_id'],
            'internetInformationNo': company['info_no'],
            'informationno': company['info_no'],
            'zj': company['zj']
        }
        yield scrapy.FormRequest(
            url='http://icid.iachina.cn/ICID/front/viewAllProsHis.do',
            method='POST',
            formdata=form,
            meta={
                'company': company,
                'type': 'his'
            },
            callback=self.parse_product_list,
            dont_filter=True)
Example #39
0
	def parse(self, response):
		"""Collect one BiaorenItem per bid-record table row that has cells.

		The id mapped to the pin of ``response.url`` selects the
		``ToubiaoItem`` object kept on ``self.object``; its pin is copied
		onto every item.
		"""
		pk = self.mapping.get(self.get_pin_from_url(response.url))
		symbol = (pk, response.url)
		self.logger.info('Parsing ID.%d Yinrendai Bidder List Info From <%s>' % symbol)
		self.object = ToubiaoItem.get_object_by_pk(pk)

		# Item fields in the order of the row's <td> cells.
		cell_fields = ('bid_nikename', 'bid_amount', 'bid_time')

		item_list = []
		for row in response.xpath('//table[@class="bidRecord"]//tr'):
			cells = row.xpath('.//td')
			if not cells:
				# Rows without <td> cells carry no bid data.
				continue

			item = BiaorenItem()
			item['pin'] = self.object.pin
			for position, field in enumerate(cell_fields):
				item[field] = get_content(cells[position].xpath('text()').extract())

			item_list.append(item)

		return item_list
Example #40
0
    def parse(self, response):
        """Collect a SymptomItem for every result block on a list page.

        Name and link are always read. Alias and the space-joined list of
        related diseases are best effort: if reading them fails, the problem
        is logged and the item is kept with what was already set.
        """
        self.logger.info('Parsing 39 Symptom URLs From <%s>.' % response.url)

        item_list = []
        for ele in response.xpath('//div[@class="res_list"]'):
            item = SymptomItem()
            item['name'] = get_content(ele.xpath('dl/dt/h3/a/@title').extract())
            item['link'] = get_content(ele.xpath('dl/dt/h3/a/@href').extract())
            try:
                item['alias'] = get_content(ele.xpath('dl/dt/cite/@title').extract())
                names = [get_content(d.xpath('text()').extract())
                         for d in ele.xpath('div/p/a')]
                item['relevant_diseases'] = ' '.join([n for n in names if n])
            except Exception as exc:
                # Keep the entry, but no longer hide the failure or swallow
                # SystemExit / KeyboardInterrupt.
                self.logger.warning('Incomplete 39 Symptom Entry From <%s>: %r',
                                    response.url, exc)
            item_list.append(item)

        return item_list
    def parse(self, response):
        """Fill a SymptomDetailInfoItem from one symptom attribute page.

        The attribute type is the last '/'-separated part of the URL. An
        empty type reads the name and intro description; any other type
        stores the catalog block under the field named by
        ``self.url_attr_db_map``. Extraction is best effort: a failure is
        logged and the item is returned with whatever was set so far.
        """
        attr_type = response.url.split('/')[-1]

        symbol = (self.mapping.get(response.url), self.url_attr_map[attr_type], response.url)
        self.logger.info('Parsing ID.%d 39health Symptom %s Info From <%s>.' % symbol)

        item = SymptomDetailInfoItem()
        item['s_id'] = symbol[0]
        try:
            if attr_type == '':
                item['name'] = get_content(response.xpath('//h1/text()').extract())
                item['description'] = get_content(response.xpath('//dd[@id="intro"]/p/text()').extract())
            else:
                item[self.url_attr_db_map[attr_type]] = \
                    get_content(response.xpath('//div[@class="item catalogItem"]').extract())
        except Exception as exc:
            # Still best effort, but no longer swallows SystemExit or
            # KeyboardInterrupt and leaves a trace of what went wrong.
            self.logger.warning('Failed To Extract 39health Symptom %s Info From <%s>: %r',
                                symbol[1], symbol[2], exc)

        return item
Example #42
0
    def parse(self, response):
        """Record news URLs of this spider's category that are newer than
        ``self.max_thread``.

        A URL is recorded when it contains ``self.category`` and the thread
        number derived from it is greater than ``self.max_thread``.
        """
        self.logger.info('Parsing Wangjia News %s URLs From <%s>.' % (self.category, response.url))

        item = ExporterItem()
        titles = response.xpath('//div[contains(@class, "specialBox")]//div[@class="news_title"]')
        for title in titles:
            url = get_content(title.xpath('a/@href').extract())
            if self.category in url:
                thread = get_thread_from_news_url(url)
                if int(self.max_thread) < int(thread):
                    item.set_record(url)

        return item
Example #43
0
    def parse(self, response):
        """Collect a DiseaseItem for every result block on a list page.

        Name and link are always read. Alias and the space-joined list of
        related symptoms are best effort: if reading them fails, the problem
        is logged and the item is kept with what was already set.
        """
        self.logger.info('Parsing 39 Disease URLs From <%s>.' % response.url)

        item_list = []
        for ele in response.xpath('//div[@class="res_list"]'):
            item = DiseaseItem()
            item['name'] = get_content(ele.xpath('dl/dt/h3/a/text()').extract())
            item['link'] = get_content(ele.xpath('dl/dt/h3/a/@href').extract())
            try:
                item['alias'] = get_content(ele.xpath('dl/dt/cite/text()').extract())
                names = [get_content(s.xpath('text()').extract())
                         for s in ele.xpath('div/p/a')]
                item['relevant_symptoms'] = ' '.join([n for n in names if n])
            except Exception as exc:
                # Keep the entry, but no longer hide the failure or swallow
                # SystemExit / KeyboardInterrupt.
                self.logger.warning('Incomplete 39 Disease Entry From <%s>: %r',
                                    response.url, exc)
            item_list.append(item)

        return item_list
Example #44
0
    def parse(self, response):
        """Return a FeatureItem with the page's feature tags joined by spaces.

        The pin is derived from ``response.url``; empty tag texts are
        dropped before joining.
        """
        self.logger.info('Parsing P2peye Archive Feature From <%s>.' % response.url)

        item = FeatureItem()
        item['pin'] = self.get_pin_from_url(response.url)

        tag_texts = [
            get_content(anchor.xpath('text()').extract())
            for anchor in response.xpath('//div[@class="bd ui-yun-parent"]/a')
        ]
        item['feature'] = ' '.join([text for text in tag_texts if text])

        return item
Example #45
0
    def parse(self, response):
        """Record exposure thread URLs newer than ``self.max_thread``.

        Only table bodies that contain a ``th.new`` header cell are
        considered.
        """
        self.logger.info('Parsing Wangjia Exporsure URLs From <%s>.' % response.url)

        item = ExporterItem()
        # NOTE(review): alternate selectors kept from the previous revision:
        #   '//div[@class="comeing_channel_tab_area"]/table/tbody'
        #   'tr/td[@class="comeing_channel_threadlist_sub"]'
        for tbody in response.xpath('//table[starts-with(@summary, "forum")]/tbody'):
            heading = tbody.xpath('tr/th[@class="new"]')
            if heading:
                url = get_content(heading.xpath('a[contains(@class, "xst")]/@href').extract())
                thread = get_thread_from_exposure_url(url)
                if int(self.max_thread) < int(thread):
                    item.set_record(url)

        return item
    def parse(self, response):
        """Fill a DiseaseElementaryInfoItem from a disease overview page.

        ``self.d_map`` maps a page label to an item field and
        ``self.type_map`` tells whether that label's value is a list of link
        labels (truthy) or the plain text of the row (falsy). The related
        drug block is optional and read on a best-effort basis.

        Returns:
            The filled item, or None when extraction fails.
        """
        symbol = (self.mapping.get(response.url), response.url)
        self.logger.info('Parsing ID.%d 39health Disease Elementary Info From <%s>.' % symbol)

        disease_ele_item = DiseaseElementaryInfoItem()
        try:
            disease_ele_item['d_id'] = symbol[0]
            disease_ele_item['name'] = get_content(response.xpath('//dl[@class="intro"]/dt/text()').extract())
            try:
                relative_drug_path = response.xpath('//div[@class="drug"]/ul/li')
                has_drug = get_content(relative_drug_path[0].xpath('i/text()').extract())
                if has_drug in self.d_map:
                    titles = [get_content(d.xpath('@title').extract())
                              for d in relative_drug_path[0].xpath('a')]
                    disease_ele_item[self.d_map[has_drug]] = ' '.join([t for t in titles if t])
            except Exception as exc:
                # Pages without a drug block end up here (IndexError on the
                # empty selection); note it quietly and carry on.
                self.logger.debug('No Related Drug Info From <%s>: %r', symbol[1], exc)

            for li in response.xpath('//div[@class="info"]/ul/li'):
                attr = get_content(li.xpath('i/text()').extract())
                if attr not in self.d_map:
                    continue

                if self.type_map[attr]:
                    labels = []
                    for link in li.xpath('a'):
                        # Prefer the title attribute, fall back to link text.
                        if link.xpath('@title'):
                            label = get_content(link.xpath('@title').extract())
                        else:
                            label = get_content(link.xpath('text()').extract())
                        if label:
                            labels.append(label)
                    disease_ele_item[self.d_map[attr]] = ' '.join(labels)
                else:
                    disease_ele_item[self.d_map[attr]] = get_content(li.xpath('text()').extract())

            return disease_ele_item
        except Exception as exc:
            # Same outcome as before (None), but the cause is now logged and
            # SystemExit / KeyboardInterrupt are no longer swallowed.
            self.logger.warning('Failed To Parse 39health Disease Elementary Info From <%s>: %r',
                                symbol[1], exc)
            return None
Example #47
0
	def parse(self, response):
		"""Fill a YaopingItem from a drug information page.

		Reads the breadcrumb categories, name, cites, English name, company
		contact fields and every ``<dl>`` section whose title is a key of
		``self.detail_map``.
		"""
		manual_id = self.mapping.get(response.url)
		symbol = (manual_id, response.url)
		self.logger.info("Parsing ID.%d 39Health Drug Informations From <%s>." % symbol)

		item = YaopingItem()
		item['manual_id'] = manual_id

		crumbs = response.xpath('//div[@class="subs"]//a').xpath('text()').extract()
		item['category_list'] = '>>'.join([get_trunk(crumb) for crumb in crumbs])

		# With 3 breadcrumb parts only the first-level category is set; with
		# 4 parts the second level is set as well.
		category_list = item['category_list'].split(">>")
		depth = len(category_list)
		if depth in (3, 4):
			item['category_first'] = category_list[1]
		if depth == 4:
			item['category_second'] = category_list[2]

		item['name'] = get_content(response.xpath('//div[@class="t1"]/h1/a/text()').extract())
		cite_texts = response.xpath('//div[@class="t1"]//cite').xpath('span/text()').extract()
		item['cites'] = '&&'.join([get_trunk(text) for text in cite_texts])

		item['english_name'] = get_content(response.xpath('//cite[@class="t2"]/text()').extract(), skipBlank=False)

		item['company'] = get_content(response.xpath('//li[@class="company"]/text()').extract())
		item['address'] = get_content(response.xpath('//li[@class="address"]/text()').extract())
		item['telephone'] = get_content(response.xpath('//li[@class="telephone"]/text()').extract(), skipBlank=False)

		for section in response.xpath('//div[@class="tab_box"]//dl'):
			heading = get_content(section.xpath('dt/text()').extract())
			attr = self.detail_map.get(heading)
			if attr:
				# string(.) flattens the <dd> content, dropping HTML tags.
				item[attr] = get_content(section.xpath('dd').xpath('string(.)').extract())

		return item
Example #48
0
    def parse(self, response):
        """Build a BaoguangItem from an exposure thread page.

        Returns None (after logging a warning) when no thread id can be
        derived from the URL, when the page contains a ``wrap`` div, or when
        it shows the ``alert_info`` message box.
        """
        symbol = (self.get_thread_from_url(response.url), response.url)
        thread, source = symbol

        if not thread:
            self.logger.warning('Invalid Wangjia Exposure Item From <%s>.' % source)
            return None

        if response.xpath('//div[@class="wrap"]'):
            self.logger.warning('May Redirect To Warning Page Of Wangjia.')
            return None

        if response.xpath('//div[@id="messagetext" and @class="alert_info"]'):
            self.logger.warning('No.%s Wangjia Exposure Item From <%s> Maybe Limited.' % symbol)
            return None

        self.logger.info('Parsing No.%s Wangjia Exposure Item From <%s>.' % symbol)

        item = BaoguangItem()
        item['thread'] = int(thread)
        item['source'] = source

        item['title'] = get_content(
            response.xpath('//span[@id="thread_subject"]').xpath('text()').extract())

        # Keep only what follows the first space of the "posted on" text.
        posted = response.xpath('//em[starts-with(@id, "authorposton")]')[0]
        poston = get_content(posted.xpath('text()').extract(), skipBlank=False)
        item['created'] = poston[poston.index(' ')+1:]

        cells = response.xpath('//div[@class="typeoption"]/table/tbody/tr/td')
        if cells:
            for position, field in enumerate(('name', 'link', 'reason')):
                item[field] = get_content(cells[position].xpath('.//text()').extract())

        body = response.xpath('//td[starts-with(@id, "postmessage")]')[0]
        texts = body.xpath('.//text()').extract()
        item['content'] = ''.join([get_trunk(text) for text in texts])
        item['raw_content'] = get_content(body.extract())

        image_urls = [self.modify_image_url(get_trunk(ref)) for ref in body.xpath('.//@file').extract()]
        # '#'-joined URL list; None when the joined string is empty.
        item['image_url'] = '#'.join(image_urls) or None

        return item
Example #49
0
	def parse(self, response):
		"""Fill a BiaodiItem from a bid detail page.

		The finance box supplies rate, term, volume, bid progress and the
		remaining amount; the three ``oneInfo`` sections supply personal,
		job and loan details. Each part is read only when its container is
		present.
		"""
		pk = self.mapping.get(self.get_pin_from_url(response.url))
		symbol = (pk, response.url)
		self.logger.info('Parsing ID.%d Yinrendai Bid List Info From <%s>' % symbol)
		self.object = ToubiaoItem.get_object_by_pk(pk)

		item = BiaodiItem()

		def fill(cells, fields, path):
			# Assign cells[i] (text selected by `path`) to fields[i], in order.
			for position, field in enumerate(fields):
				item[field] = get_content(cells[position].xpath(path).extract())

		finance = response.xpath('//div[@class="finance_box clearfix"]')
		if finance:
			left = finance.xpath('div[@class="elite_left l"]')
			fill(left.xpath('table/tr[@class="num"]/td'),
				('interest_rate', 'term', 'volume'),
				'strong/text()')

			# The bid progress is stored through self.bid_detail_form so it
			# can be picked apart again later.
			progress = left.xpath('div/p[@class="progressTxt l"]')
			item['bid_detail'] = self.bid_detail_form.format(
				num=get_content(progress.xpath('span/text()').extract()),
				percentage=get_content(progress.xpath('//span[@id="percent"]/text()').extract()))

			item['remain_amount'] = get_content(finance.xpath('div[@class="elite_right l"]/p/span/text()').extract())

		detail = response.xpath('//li[@class="oneInfo"]')
		if detail:
			# (section index, fields in cell order) for the three info tables.
			sections = (
				(0, ('nikename', 'gender', 'phone_number', 'education',
					'marital_status', 'house', 'address')),
				(1, ('job_type', 'job_city', 'job_year', 'annual_income',
					'credit_limit')),
				(2, ('loan_volume', 'loan_term', 'loan_interest_rate',
					'loan_purpose', 'payment_method', 'tender_deadline')),
			)
			for section_index, fields in sections:
				cells = detail[section_index].xpath('table//td[not(@class="dd")]')
				if cells:
					fill(cells, fields, 'text()')

		return item
Example #50
0
    def parse(self, response):
        """Fill a TedianItem with the tags and comment scores of a platform.

        The platform name comes from the ``DaohangItem`` object looked up
        through the pin of ``response.url``; tag and comment fields are read
        only when their containers are present.
        """
        pk = self.mapping.get(self.get_pin_from_url(response.url))
        symbol = (pk, response.url)
        self.logger.info('Parsing ID.%d Wangjia Feature From <%s>.' % symbol)
        self.object = DaohangItem.get_object_by_pk(pk)

        item = TedianItem()
        item['name'] = self.object.name

        rtag = response.xpath('//div[@class="rTags"]')
        if rtag:
            item['status'] = get_content(rtag.xpath('./span[@class="tag3"]/text()').extract())
            item['company_tag'] = get_content(rtag.xpath('./span[@class="tag tag2"]/text()').extract())

            plain_tags = rtag.xpath('./span[@class = "tag"]').xpath('text()').extract()
            item['illustration'] = '/'.join([get_trunk(tag) for tag in plain_tags])

        comment_info = response.xpath('//div[contains(@class,"box commentBox")]')
        if comment_info:
            scores = comment_info.xpath('./dl[@class="comment"]')
            item['recommendation'] = get_content(scores.xpath('./dt/span/text()').extract())

            # Numeric scores, in the order of the "num" spans.
            numbers = scores.xpath('./dd/span[@class="num"]')
            for position, field in enumerate(
                    ('withdraw_num', 'guard_num', 'service_num', 'experience_num')):
                item[field] = get_content(numbers[position].xpath('text()').extract())

            # Accompanying texts, in the order of the remaining spans.
            notes = scores.xpath('.//span[not(@class="num")]')
            for position, field in enumerate(
                    ('withdraw_day', 'guard_day', 'service_status', 'experience_status')):
                item[field] = get_content(notes[position].xpath('text()').extract())

            impressions = comment_info.xpath('./dl[@class="impression"]/dd//span').xpath('text()').extract()
            item['impression'] = '\001'.join([get_trunk(text) for text in impressions])

        return item
Example #51
0
    def parse(self, response):
        """Fill a PingjiItem from an archived rating page.

        Returns None when the exponent detail paragraphs are missing;
        otherwise the item, after ``log_empty_fields`` has been run on it.
        """
        symbol = (self.timestamp, response.url)
        self.logger.info('Parsing %s Wangjia Rating From Archive <%s>.' % symbol)

        item = PingjiItem()
        item['timestamp'] = symbol[0]

        detail = response.xpath('//div[contains(@class, "detailBox")]/p')
        if detail:
            # (field, paragraph index) in assignment order.
            for field, position in (('name', 0), ('launch_time', 4), ('location', 3)):
                item[field] = get_content(detail[position].xpath('text()').extract())

        record = response.xpath('//div[@class="recordHead"]/div[@class="con"]/p')
        if record:
            item['exponent'] = get_content(record.xpath('span[@class="num"]/text()').extract())

        exp = response.xpath('//div[contains(@class, "expBox")]/div[@class="bd"]/div[@class="detail"]/p')
        if not exp:
            return None

        # (field, paragraph index); the page order differs from the field
        # order for the last three entries.
        score_positions = (
            ('deal', 0),
            ('popularity', 1),
            ('profit', 2),
            ('revenue', 3),
            ('lever', 4),
            ('brand', 5),
            ('dispersity', 7),
            ('mobility', 8),
            ('transparency', 6),
        )
        for field, position in score_positions:
            item[field] = get_content(exp[position].xpath('span[@class="num"]/text()').extract())

        log_empty_fields(item, self.logger)
        return item
Example #52
0
    def parse(self, response):
        """Collect ``DaohangItem`` platforms from a navigation response.

        The response flavour is chosen by the URL suffix:

        * ``html`` -- the HTML navigation page; platforms are read from the
          ``platList`` blocks and tagged with the province id of their block.
        * ``json`` -- a JSON array of flat platform records.
        * anything else -- a JSON array of per-city records, each carrying a
          ``platList`` of platforms; records without a ``city`` are skipped.

        Returns the list of built items (possibly empty).
        """
        url = response.url
        collected = []

        if not url.endswith('html'):
            records = json.loads(response.body_as_unicode())

            if url.endswith('json'):
                for rec in records:
                    platform = DaohangItem()
                    platform['pin'] = rec.get('platPin')
                    platform['allPin'] = rec.get('allPlatPin')
                    platform['name'] = rec.get('platName')
                    platform['link'] = rec.get('platUrl')
                    collected.append(platform)
                return collected

            for rec in records:
                city = rec.get('city')
                if not city:
                    continue

                province_id = ProvinceItem.get_id_by_name(city)
                for plat in rec.get('platList'):
                    platform = DaohangItem()
                    platform['pin'] = plat.get('platLetter')
                    platform['name'] = plat.get('platName')
                    platform['link'] = plat.get('platUrl')
                    platform['province_id'] = province_id
                    platform['launch_time'] = plat.get('onlineDateStr')
                    platform['icon_url'] = plat.get('platIconUrl')
                    collected.append(platform)
            return collected

        # Regular platforms, grouped on the page by province.
        province_path = 'div[@class="til"]/div/p[not(@class="til_num")]/text()'
        for block in response.xpath('//div[@id="platList"]/div[starts-with(@class, "rnav")]'):
            province_name = get_content(block.xpath(province_path).extract())
            province_id = ProvinceItem.get_id_by_name(province_name)

            for entry in block.xpath('ul[@class="til_cn"]/li'):
                platform = DaohangItem()
                # The pin is the last non-empty path segment of the "purl" attribute.
                segments = get_content(entry.xpath('a/@purl').extract()).split('/')
                while not segments[-1]:
                    segments.pop()
                platform['pin'] = segments[-1]
                platform['name'] = get_content(entry.xpath('a/text()').extract())
                platform['link'] = get_content(entry.xpath('a/@href').extract())
                platform['province_id'] = province_id
                collected.append(platform)

        # NOTE: problematic platforms are temporarily not collected.  They live
        # under '//div[@id="issuePlatList"]/div[starts-with(@class, "rnav")]'
        # with the same markup as above, except that their 'a/@href' link is
        # invalid and must not be stored.

        return collected
Example #53
0
    def parse(self, response):
        """Turn a CFDA drug detail page into a ``CFDADrug`` item.

        The detail table must have more than 13 rows; otherwise ``None`` is
        returned.  ``url_id`` is the integer after the last ``=`` in the URL,
        and rows 1..13 of the table supply the remaining fields from their
        second cell.
        """
        self.logger.info('Parsing cfda drug info From <%s>.' % response.url)
        item = CFDADrug()

        rows = response.xpath('//div[@class="listmain"]/div/table[1]/tr')
        if len(rows) <= 13:
            return None

        item['url_id'] = int(response.url.split('=')[-1])

        plain_cell = 'td[2]/text()'
        linked_cell = 'td[2]/a/text()'  # the producer name is wrapped in a link
        # Fields in table-row order, starting at row 1 (row 0 is not read).
        row_fields = (
            ('approval_num', plain_cell),
            ('name', plain_cell),
            ('en_name', plain_cell),
            ('trade_name', plain_cell),
            ('dosage_forms', plain_cell),
            ('norm', plain_cell),
            ('producer', linked_cell),
            ('product_address', plain_cell),
            ('type', plain_cell),
            ('origin_approval_num', plain_cell),
            ('approval_date', plain_cell),
            ('drug_based_code', plain_cell),
            ('remark', plain_cell),
        )
        for row_no, (field, cell_path) in enumerate(row_fields, 1):
            item[field] = get_content(rows[row_no].xpath(cell_path).extract())

        return item
Example #54
0
    def parse(self, response):
        """Parse a Wangjia archive page into a ``DanganItem``.

        The platform is resolved from the URL pin through ``self.mapping`` and
        loaded with ``DaohangItem.get_object_by_pk``; the loaded object is
        kept on ``self.object`` (see the NOTE below) and provides the item
        name.

        Every page section (details, introduction, company info, domain
        record, staff, fees, contact, record list) is optional: a section
        that is absent from the page is skipped and its fields are left
        unset, instead of aborting the whole item with an ``IndexError``.
        Within a section that is present, the expected number of entries is
        still assumed.  ``business_type`` is ``None`` when the record list
        has fewer than ten entries.

        Returns the populated item.
        """
        #NOTE: (zacky, 2015.APR.27th) PIPELINE FUNCTIONS RELATED WILL BE PROCESSED, SO WE KEEP THE OBJECT STATE HERE.
        symbol = (self.mapping.get(self.get_pin_from_url(response.url)), response.url)
        self.logger.info('Parsing ID.%d Wangjia Archive From <%s>.' % symbol)
        self.object = DaohangItem.get_object_by_pk(symbol[0])

        item = DanganItem()
        item['name'] = self.object.name
        item['logo_url'] = get_content(response.xpath('//div[@class="rLogo"]/a/img/@src').extract())

        detail = response.xpath('//div[contains(@class, "detailBox")]/p')
        if detail:
            item['link'] = get_content(detail[1].xpath('a/@href').extract())
            item['location'] = get_content(detail[3].xpath('text()').extract())
            item['launch_time'] = get_content(detail[4].xpath('text()').extract())

        about = response.xpath('//div[contains(@class, "aboutBd")]/p')
        if about:
            item['introduction'] = ' '.join([get_trunk(c) for c in about.xpath('.//text()').extract()])

        # Company registration info: one paragraph per field, in page order.
        info = response.xpath('//div[contains(@class, "inforBd")]/p[not(contains(@class, "line"))]')
        if info:
            info_fields = (
                'company_name',
                'artificial_person',
                'company_type',
                'shareholder_stucture',
                'registered_capital',
                'contributed_capital',
                'registered_address',
                'opening_date',
                'approved_date',
                'registration_authority',
                'business_licence',
                'institutional_framework',
                'tax_registration_num',
            )
            for idx, field in enumerate(info_fields):
                item[field] = get_content(info[idx].xpath('text()').extract())

        # Domain record: the cells of the second table row.  Check the row
        # count first so a page without that table does not raise IndexError.
        domain_rows = response.xpath('//div[contains(@class, "webRecordBd")]/table/tbody/tr')
        domain_cells = domain_rows[1].xpath('td') if len(domain_rows) > 1 else []
        if domain_cells:
            domain_fields = (
                'domain_name',
                'domain_date',
                'domain_company_type',
                'domain_company_name',
                'icp',
            )
            for idx, field in enumerate(domain_fields):
                item[field] = get_content(domain_cells[idx].xpath('text()').extract())

        people = response.xpath('//div[contains(@class, "peopleBd")]/ul/li')
        if people:
            avatar_url = []
            content = []
            for person in people:
                avatar_url.extend(person.xpath('div[@class="avatar"]/img/@src').extract())
                content.extend([get_trunk(c) for c in person.xpath('p//text()').extract()])
            item['company_person_avatar_url'] = '#'.join(avatar_url)
            item['company_person'] = ' '.join(content)

        # The fee box and the contact box share the "costBd" class: the first
        # match holds the fees, the second the contact details.  Query once
        # and guard each index so a missing box is simply skipped.
        cost_boxes = response.xpath('//div[contains(@class, "costBd")]')

        cost = cost_boxes[0].xpath('p') if len(cost_boxes) > 0 else []
        if cost:
            cost_fields = (
                'management_fee',
                'prepaid_fee',
                'cash_withdrawal_fee',
                'vip_fee',
                'transfer_fee',
                'mode_of_payment',
            )
            for idx, field in enumerate(cost_fields):
                item[field] = get_content(cost[idx].xpath('text()').extract())

        contact = cost_boxes[1].xpath('p') if len(cost_boxes) > 1 else []
        if contact:
            contact_fields = (
                'contact_address',
                'phone_400',
                'phone',
                'fax',
                'email',
            )
            for idx, field in enumerate(contact_fields):
                item[field] = get_content(contact[idx].xpath('text()').extract())

        # Record list: the fields start at the fourth entry (index 3).
        record_list = response.xpath('//div[contains(@class, "recordListBox")]/ul/li')
        if record_list:
            record_fields = (
                'is_automatic_bid',
                'is_equitable_assignment',
                'trust_fund',
                'tender_security',
                'security_mode',
                'guarantee_institution',
            )
            for idx, field in enumerate(record_fields, 3):
                item[field] = get_content(record_list[idx].xpath('.//text()').extract(), skipFirst=True)

            # The tenth entry is optional; store None (not the boolean False
            # that "cond and value" would yield) when it is absent.
            if len(record_list) >= 10:
                item['business_type'] = get_content(record_list[9].xpath('.//text()').extract(), skipFirst=True)
            else:
                item['business_type'] = None

        log_empty_fields(item, self.logger)
        return item