def parse(self, item):
    """Populate *item* with fields scraped from an API summary page.

    Reads the page via ``self.sel`` (a Selector set up elsewhere) using the
    spider's configured XPaths, fills name/description/tags plus every
    label/value pair in the summary table, and returns the completed item.

    :param item: the item (dict-like) to fill in place.
    :returns: the same *item*, for convenience.
    """
    name = self.sel.xpath(self.xpath_name).extract()[0].encode("utf-8")
    # BUG FIX: the original used name.rstrip(" API"), but rstrip takes a
    # *character set*, not a suffix — it strips any trailing run of the
    # characters ' ', 'A', 'P', 'I' (so "NASA API" became "NAS").
    # Remove the exact " API" suffix instead.
    if name.endswith(" API"):
        name = name[:-len(" API")]
    item["name"] = name

    description = self.sel.xpath(self.xpath_description)
    item_filled(description, item, "description", True)

    # Tags are stored as a single comma-separated string, or None when the
    # page has no tag elements.  join() replaces the original quadratic
    # `+= tag + ","` loop (and avoids rstrip(",") eating legitimate commas).
    tags = self.sel.xpath(self.xpath_tags)
    if tags:
        item["tags"] = ",".join(tags.extract())
    else:
        item["tags"] = None

    # Each summary row is a <label>/<span> pair; map the label text to an
    # item field via __switch_label and join the span's text fragments.
    summary_field = self.sel.xpath(self.xpath_summary_field)
    for summary_each_field in summary_field:
        label = summary_each_field.xpath('./label/text()').extract()[0].strip()
        summary_content = summary_each_field.xpath('./span/text()|./span/*/text()').extract()
        content = ''.join(summary_content)
        field = self.__switch_label(label)
        item[field] = content.encode("utf-8")
    return item
def parse(self, response):
    """List-page callback: schedule a summary request for every API row on
    this page, then follow the pager back into this same callback.
    """
    selector = Selector(response)
    # Every API row links to its detail page from the title column.
    for href in selector.xpath('//td[@class="views-field views-field-title col-md-3"]/a/@href').extract():
        record = API()
        api_item_init(record)
        record['pweb_link'] = self.base_url + href.strip().encode("utf-8")
        # The sibling "created" cell of the same table row carries the
        # last-updated date for this API.
        updated = selector.xpath('//td/a[@href="' + href + '"]/../following-sibling::td[@class="views-field views-field-created"]/text()')
        item_filled(updated, record, 'updated_time', True)
        yield Request(url=self.base_url + href,
                      callback=self.api_summary_parse,
                      meta={'item': record, 'type': 'api_summary'},
                      dont_filter=True)
    # Follow pagination, if a "next page" link exists.
    pager = selector.xpath('//a[@title="Go to next page"]/@href')
    if pager:
        yield Request(self.base_url + pager.extract()[0],
                      self.parse,
                      meta={'type': 'api_page'},
                      dont_filter=True)