def _parse_item(self, response):
    """Parse one chinairn.com listing page.

    Yields one IndustryReportSpiderItem per report row, then — unless the
    module-level Page_Limit has been reached or the next-page link equals
    the last-page link — a FormRequest for the following listing page.
    """
    domain_url = "http://www.chinairn.com/"
    reports = response.xpath("//p[@class='maintittle']")
    for report in reports:
        item = IndustryReportSpiderItem()
        item["industry_large_category"] = response.meta["large_category"]
        item["industry_mid_category"] = response.meta["mid_category"]
        item["report_name"] = clean_text(
            report.xpath(".//text()").extract()[0].strip())
        # Compute once instead of calling parseIndustryName() twice.
        industry_small_chs_name = parseIndustryName(item["report_name"])
        if industry_small_chs_name is not None:
            item["industry_small_chs_name"] = industry_small_chs_name
        page_url = report.xpath(".//@href").extract()[0]
        item["report_link"] = urljoin(domain_url, page_url)
        item["source_domain"] = self.allowed_domains[0]
        item["source_name"] = u"中国行业研究网"
        try:
            self.report_para(item, report)
        except Exception:
            # Best-effort enrichment: leave revision-time fields unset.
            log.msg("Report revision time missed: %s" % item["report_link"],
                    level=log.WARNING)
        item["price_free"] = False
        yield item
    Current_Page = clean_text(
        response.xpath(".//*[@class='hover']/text()").extract()[0])
    if Page_Limit > 0 and int(Current_Page) > Page_Limit:
        return
    nextPage = response.xpath("//a[contains(@class,'down')]")[0]
    lastPageurl = nextPage.xpath("./following-sibling::a[1]/@href").extract()[0]
    nextPageurl = nextPage.xpath("./@href").extract()[0]
    # On the final page the "next" link points at the "last" link's target.
    if lastPageurl != nextPageurl:
        url = urljoin(self.base_url, nextPageurl)
        request = FormRequest(url, callback=self._parse_item, dont_filter=True)
        request.meta["large_category"] = response.meta["large_category"]
        request.meta["mid_category"] = response.meta["mid_category"]
        yield request
def _parse_free(self, response):
    """Parse a listing page of free reports; yield one item per entry.

    Fixes: bare ``except:``/``!= None`` idioms, and timezone attachment —
    ``date.replace(tzinfo=pytz.timezone(...))`` yields the historical LMT
    offset (+08:06 for Asia/Shanghai); ``tz.localize(date)`` is correct.
    """
    reports = response.xpath(".//*[@class='tul3']//li")
    # Iterating an empty selector list is a no-op, so no length guard needed.
    for report in reports:
        item = IndustryReportSpiderItem()
        item["industry_large_category"] = response.meta["large_category"]
        item["report_name"] = clean_text(
            report.xpath(".//a//text()").extract()[0].strip())
        industry_small_chs_name = parseIndustryName(item["report_name"])
        if industry_small_chs_name is not None:
            item["industry_small_chs_name"] = industry_small_chs_name
        page_url = report.xpath(".//@href").extract()[0]
        item["report_link"] = page_url
        report_content = self.parseTimeContent(page_url)
        if report_content is not None:
            item["report_content"] = report_content
        report_time = clean_text(
            report.xpath(".//span//text()").extract()[0].strip())
        if report_time:
            item["report_revision_time"] = report_time
            date, date_precision = parse_date(item["report_revision_time"])
            try:
                item["report_revision_time_standard"] = pytz.timezone(
                    'Asia/Shanghai').localize(date)
            except Exception:
                # parse_date may yield None for unparseable strings;
                # leave the standardized timestamp unset in that case.
                pass
        item["source_domain"] = self.allowed_domains[0]
        item["source_name"] = u"中国产业洞察网"
        item["price_free"] = True
        yield item
def _parse_research(self, response):
    """Parse a research-report listing page; yield one paid item per entry.

    Fixes: bare ``except:``/``!= None`` idioms, and uses
    ``tz.localize(date)`` instead of ``date.replace(tzinfo=...)`` (the
    latter attaches the wrong LMT offset with pytz zones).
    """
    reports = response.xpath(".//*[@id='ulNewsList']//li")
    # Iterating an empty selector list is a no-op, so no length guard needed.
    for report in reports:
        item = IndustryReportSpiderItem()
        item["industry_large_category"] = response.meta["large_category"]
        item["industry_mid_category"] = response.meta["mid_category"]
        item["report_name"] = clean_text(
            report.xpath(".//dt//text()").extract()[0].strip())
        industry_small_chs_name = parseIndustryName(item["report_name"])
        if industry_small_chs_name is not None:
            item["industry_small_chs_name"] = industry_small_chs_name
        page_url = report.xpath(".//@href").extract()[0]
        item["report_link"] = page_url
        report_time = clean_text(
            report.xpath(".//*[@class='time']").extract()[0].strip())
        if report_time:
            # The cell reads "label：value"; keep only the value part.
            item["report_revision_time"] = report_time.split(u"：")[1]
            date, date_precision = parse_date(item["report_revision_time"])
            try:
                item["report_revision_time_standard"] = pytz.timezone(
                    'Asia/Shanghai').localize(date)
            except Exception:
                # parse_date may yield None; leave the field unset then.
                pass
        item["source_domain"] = self.allowed_domains[0]
        item["source_name"] = u"中国产业洞察网"
        item["price_free"] = False
        yield item
def _parse_research(self, response):
    """Yield a paid report item for every entry in the #ulNewsList listing.

    Fixes over the original: narrow ``except Exception:`` instead of a
    bare ``except:``, ``is not None`` comparisons, and timezone handling —
    pytz zones must be attached with ``localize()``, not ``replace()``.
    """
    reports = response.xpath(".//*[@id='ulNewsList']//li")
    # An empty selector list simply yields nothing; no explicit guard.
    for report in reports:
        item = IndustryReportSpiderItem()
        item["industry_large_category"] = response.meta["large_category"]
        item["industry_mid_category"] = response.meta["mid_category"]
        item["report_name"] = clean_text(
            report.xpath(".//dt//text()").extract()[0].strip())
        industry_small_chs_name = parseIndustryName(item["report_name"])
        if industry_small_chs_name is not None:
            item["industry_small_chs_name"] = industry_small_chs_name
        page_url = report.xpath(".//@href").extract()[0]
        item["report_link"] = page_url
        report_time = clean_text(
            report.xpath(".//*[@class='time']").extract()[0].strip())
        if report_time:
            # Drop the "label：" prefix from the fullwidth-colon cell.
            item["report_revision_time"] = report_time.split(u"：")[1]
            date, date_precision = parse_date(item["report_revision_time"])
            try:
                item["report_revision_time_standard"] = pytz.timezone(
                    'Asia/Shanghai').localize(date)
            except Exception:
                # Unparseable date (parse_date returned None): skip field.
                pass
        item["source_domain"] = self.allowed_domains[0]
        item["source_name"] = u"中国产业洞察网"
        item["price_free"] = False
        yield item
def _parse_free(self, response):
    """Yield a free report item for every entry in the .tul3 listing.

    Fixes over the original: narrow ``except Exception:`` instead of a
    bare ``except:``, ``is not None`` comparisons, and pytz timezone
    attachment via ``localize()`` rather than ``replace(tzinfo=...)``.
    """
    reports = response.xpath(".//*[@class='tul3']//li")
    # An empty selector list simply yields nothing; no explicit guard.
    for report in reports:
        item = IndustryReportSpiderItem()
        item["industry_large_category"] = response.meta["large_category"]
        item["report_name"] = clean_text(
            report.xpath(".//a//text()").extract()[0].strip())
        industry_small_chs_name = parseIndustryName(item["report_name"])
        if industry_small_chs_name is not None:
            item["industry_small_chs_name"] = industry_small_chs_name
        page_url = report.xpath(".//@href").extract()[0]
        item["report_link"] = page_url
        report_content = self.parseTimeContent(page_url)
        if report_content is not None:
            item["report_content"] = report_content
        report_time = clean_text(
            report.xpath(".//span//text()").extract()[0].strip())
        if report_time:
            item["report_revision_time"] = report_time
            date, date_precision = parse_date(item["report_revision_time"])
            try:
                item["report_revision_time_standard"] = pytz.timezone(
                    'Asia/Shanghai').localize(date)
            except Exception:
                # Unparseable date (parse_date returned None): skip field.
                pass
        item["source_domain"] = self.allowed_domains[0]
        item["source_name"] = u"中国产业洞察网"
        item["price_free"] = True
        yield item
def parse_item(self, response):
    """Build and yield one report item from a report detail page."""
    meta = response.meta
    title = clean_text(response.xpath("//h1/text()").extract()[0].strip())
    item = IndustryReportSpiderItem()
    item["industry_large_category"] = meta["large_category_name"]
    item["industry_mid_category"] = meta["mid_category_name"]
    item["report_name"] = title
    item["report_type"] = meta["report_type"]
    item["industry_small_chs_name"] = parseIndustryName(title)
    item["price_free"] = self._parse_price(response)
    item["report_link"] = response.url
    item["source_domain"] = self.base_url
    item["source_name"] = u"中国产业发展研究网"
    yield item
def parse_item(self, response):
    """Yield a single report item scraped from a detail page."""
    item = IndustryReportSpiderItem()
    name = clean_text(response.xpath("//h1/text()").extract()[0].strip())
    small_category = parseIndustryName(name)
    item["industry_large_category"] = response.meta["large_category_name"]
    item["industry_mid_category"] = response.meta["mid_category_name"]
    item["report_name"] = name
    item["report_type"] = response.meta["report_type"]
    item["industry_small_chs_name"] = small_category
    item["price_free"] = self._parse_price(response)
    item["report_link"] = response.url
    item["source_domain"] = self.base_url
    item["source_name"] = u"中国产业发展研究网"
    yield item
def _parse_hy_large(self, response):
    """Follow each large-category link, skipping the regional SME quarterly."""
    for anchor in response.xpath(".//*[@class='yahei f16 fB']"):
        name = clean_text(anchor.xpath(".//text()").extract()[0].strip())
        if u"区域重点行业中小企业季报" in name:
            continue  # not an industry category; skip it
        href = anchor.xpath(".//@href").extract()[0]
        request = FormRequest(urljoin(self.base_url, href),
                              callback=self._parse_hg_mid,
                              dont_filter=True)
        request.meta["large_category"] = parseIndustryName(name)
        yield request
def _parse_item(self, response):
    """Yield paid report items from a listing page and follow the
    '下一页' (next page) link when one is present."""
    entries = response.xpath(".//*[@class='info']")
    if len(entries) > 0:
        for entry in entries:
            item = IndustryReportSpiderItem()
            item["industry_large_category"] = response.meta["large_category"]
            item["industry_mid_category"] = response.meta["mid_category"]
            item["report_name"] = clean_text(
                entry.xpath(".//h3//a/text()").extract()[0].strip())
            small_name = parseIndustryName(item["report_name"])
            if small_name is not None:
                item["industry_small_chs_name"] = small_name
            item["report_link"] = urljoin(
                self.base_url, entry.xpath(".//@href").extract()[0])
            rdate_text = clean_text(
                entry.xpath(
                    " //*[@class='rdate']//span/text()").extract()[0].strip())
            parsed = self.parseItem(rdate_text)
            if len(parsed) == 1:
                item["report_revision_time"] = parsed[0][0]
                item["report_page_count"] = parsed[0][1]
                item["report_graph_count"] = parsed[0][2]
                date, date_precision = parse_date(
                    item["report_revision_time"])
                item["report_revision_time_standard"] = date.replace(
                    tzinfo=pytz.timezone('Asia/Shanghai'))
            item["source_domain"] = self.allowed_domains[0]
            item["source_name"] = u"中国投资咨询网"
            item["price_free"] = False
            yield item
        pager = response.xpath(".//*[@class='zw']")
        if len(pager) > 0:
            # A trailing '下一页' anchor means more pages remain.
            if (pager.xpath(".//text()").extract()[-1]) == u'下一页':
                next_href = pager.xpath(".//@href").extract()[-1]
                request = FormRequest(urljoin(self.base_url, next_href),
                                      callback=self._parse_item,
                                      dont_filter=True)
                request.meta["large_category"] = response.meta[
                    "large_category"]
                request.meta["mid_category"] = response.meta["mid_category"]
                yield request
def _parse_item(self, response):
    """Parse a listing page of 欧咨网 reports; yield items and the next page.

    Bug fix: the original computed report_revision_time_standard
    unconditionally, raising KeyError whenever parseTime() returned None
    (so "report_revision_time" was never set). The timestamp fields are
    now only populated when a revision time actually exists. The pytz
    zone is also attached with localize() instead of replace(), which
    would pin the wrong LMT offset.
    """
    reports = response.xpath(".//*[@class='img_des']/a")
    if len(reports) > 0:
        for report in reports:
            item = IndustryReportSpiderItem()
            item["industry_large_category"] = response.meta["large_category"]
            item["industry_mid_category"] = response.meta["mid_category"]
            item["report_name"] = clean_text(
                report.xpath("./text()").extract()[0].strip())
            industry_small_chs_name = parseIndustryName(item["report_name"])
            if industry_small_chs_name is not None:
                item["industry_small_chs_name"] = industry_small_chs_name
            page_url = report.xpath(".//@href").extract()[0]
            item["report_link"] = urljoin(self.base_url, page_url)
            report_time = self.parseTime(item["report_link"])
            if report_time is not None:
                item["report_revision_time"] = report_time
                date, date_precision = parse_date(
                    item["report_revision_time"])
                item["report_revision_time_standard"] = pytz.timezone(
                    'Asia/Shanghai').localize(date)
            item["source_domain"] = self.allowed_domains[0]
            item["source_name"] = u"欧咨网"
            item["price_free"] = False
            yield item
        # More than one pager link means pagination exists.
        if len(response.xpath(".//*[@class='page']//@href")) > 1:
            page_len = clean_text(
                response.xpath(
                    ".//*[@class='page']//*[@class='fl_l']/text()"
                ).extract()[0].strip())
            nextPageurl = response.xpath(
                ".//*[@class='page']//@href").extract()[-1]
            finds = self.pattern_page.findall(page_len)
            currentPage = finds[0][0]
            totlePage = finds[0][1]
            # Stop when the "current/total" counter says this is the last page.
            if currentPage != totlePage:
                url = urljoin(self.base_url, nextPageurl)
                request = FormRequest(url, callback=self._parse_item,
                                      dont_filter=True)
                request.meta["large_category"] = response.meta[
                    "large_category"]
                request.meta["mid_category"] = response.meta["mid_category"]
                yield request
def _parse_item(self, response):
    """Extract paid report items from an .info listing and queue the
    next page when a '下一页' link is present."""
    info_blocks = response.xpath(".//*[@class='info']")
    if len(info_blocks) > 0:
        for block in info_blocks:
            item = IndustryReportSpiderItem()
            item["industry_large_category"] = response.meta["large_category"]
            item["industry_mid_category"] = response.meta["mid_category"]
            title = clean_text(
                block.xpath(".//h3//a/text()").extract()[0].strip())
            item["report_name"] = title
            chs_name = parseIndustryName(title)
            if chs_name is not None:
                item["industry_small_chs_name"] = chs_name
            link = urljoin(self.base_url,
                           block.xpath(".//@href").extract()[0])
            item["report_link"] = link
            meta_text = clean_text(
                block.xpath(
                    " //*[@class='rdate']//span/text()").extract()[0].strip())
            fields = self.parseItem(meta_text)
            if len(fields) == 1:
                item["report_revision_time"] = fields[0][0]
                item["report_page_count"] = fields[0][1]
                item["report_graph_count"] = fields[0][2]
                date, date_precision = parse_date(
                    item["report_revision_time"])
                item["report_revision_time_standard"] = date.replace(
                    tzinfo=pytz.timezone('Asia/Shanghai'))
            item["source_domain"] = self.allowed_domains[0]
            item["source_name"] = u"中国投资咨询网"
            item["price_free"] = False
            yield item
        nav = response.xpath(".//*[@class='zw']")
        if len(nav) > 0:
            # The last text node being '下一页' signals another page.
            if (nav.xpath(".//text()").extract()[-1]) == u'下一页':
                href = nav.xpath(".//@href").extract()[-1]
                request = FormRequest(urljoin(self.base_url, href),
                                      callback=self._parse_item,
                                      dont_filter=True)
                request.meta["large_category"] = response.meta[
                    "large_category"]
                request.meta["mid_category"] = response.meta["mid_category"]
                yield request
def _parse_item(self, response):
    """Yield one paid report item for each .clistdl entry on the page."""
    for entry in response.xpath(".//*[@class='clistdl']"):
        item = IndustryReportSpiderItem()
        item["industry_large_category"] = response.meta["large_category"]
        item["industry_mid_category"] = response.meta["mid_category"]
        title = clean_text(entry.xpath(".//dt/a/text()").extract()[0].strip())
        item["report_name"] = title
        # Prefer the explicit industry @title attribute; fall back to
        # the report title itself when the attribute is absent.
        title_attrs = entry.xpath(".//dd//*[@class='cxgrep']//@title")
        if len(title_attrs) > 0:
            industry_text = clean_text(title_attrs.extract()[0].strip())
        else:
            industry_text = title
        chs_name = parseIndustryName(industry_text)
        if chs_name is not None:
            item["industry_small_chs_name"] = chs_name
        item["report_link"] = entry.xpath(".//@href").extract()[0]
        revision = clean_text(
            entry.xpath(".//dt/span/text()").extract()[0].strip())
        item["report_revision_time"] = revision
        item["source_domain"] = self.allowed_domains[0]
        item["source_name"] = u"中国报告大厅"
        date, date_precision = parse_date(revision)
        item["report_revision_time_standard"] = date.replace(
            tzinfo=pytz.timezone('Asia/Shanghai'))
        item["price_free"] = False
        yield item
def _parse_item(self, response):
    """Yield 欧咨网 report items from a listing page, then the next page.

    Bug fix: the original read item["report_revision_time"]
    unconditionally for parse_date(), which raised KeyError whenever
    self.parseTime() returned None and the key was never assigned; the
    standardized timestamp is now only computed when a revision time is
    available. The pytz zone is attached with localize() rather than
    replace(tzinfo=...), which would record the obsolete LMT offset.
    """
    reports = response.xpath(".//*[@class='img_des']/a")
    if len(reports) > 0:
        for report in reports:
            item = IndustryReportSpiderItem()
            item["industry_large_category"] = response.meta["large_category"]
            item["industry_mid_category"] = response.meta["mid_category"]
            item["report_name"] = clean_text(
                report.xpath("./text()").extract()[0].strip())
            industry_small_chs_name = parseIndustryName(item["report_name"])
            if industry_small_chs_name is not None:
                item["industry_small_chs_name"] = industry_small_chs_name
            page_url = report.xpath(".//@href").extract()[0]
            item["report_link"] = urljoin(self.base_url, page_url)
            report_time = self.parseTime(item["report_link"])
            if report_time is not None:
                item["report_revision_time"] = report_time
                date, date_precision = parse_date(
                    item["report_revision_time"])
                item["report_revision_time_standard"] = pytz.timezone(
                    'Asia/Shanghai').localize(date)
            item["source_domain"] = self.allowed_domains[0]
            item["source_name"] = u"欧咨网"
            item["price_free"] = False
            yield item
        # More than one pager link means pagination exists.
        if len(response.xpath(".//*[@class='page']//@href")) > 1:
            page_len = clean_text(
                response.xpath(
                    ".//*[@class='page']//*[@class='fl_l']/text()"
                ).extract()[0].strip())
            nextPageurl = response.xpath(
                ".//*[@class='page']//@href").extract()[-1]
            finds = self.pattern_page.findall(page_len)
            currentPage = finds[0][0]
            totlePage = finds[0][1]
            # Only follow the link while current page != total pages.
            if currentPage != totlePage:
                url = urljoin(self.base_url, nextPageurl)
                request = FormRequest(url, callback=self._parse_item,
                                      dont_filter=True)
                request.meta["large_category"] = response.meta[
                    "large_category"]
                request.meta["mid_category"] = response.meta["mid_category"]
                yield request