def parse(self, response):
    """Parse a stats XML response and yield one GJZDItem per station.

    Each ``<stat>`` node under ``<stats>`` carries a ``<name>`` and an
    ``<xy>`` text of the form ``"lng,lat"``.  The item's ``source`` field
    is taken from the request meta (empty string when absent).
    """
    selector = XmlXPathSelector(response)
    source = response.meta.get("source", "")
    for stats_node in selector.xpath("//stats"):
        # Relative paths (./stat/...): the original used absolute
        # "//stats/stat/..." inside this loop, which re-selects every
        # <stat> in the WHOLE document per <stats> node and therefore
        # yields duplicate items when more than one <stats> exists.
        names = stats_node.xpath("./stat/name/text()").extract()
        xys = stats_node.xpath("./stat/xy/text()").extract()
        # zip() pairs each name with its coordinate string and tolerates
        # a length mismatch instead of raising IndexError on xy[i].
        for name, xy in zip(names, xys):
            item = GJZDItem()
            item["name"] = name
            item["source"] = source
            # xy is "lng,lat"; split once instead of twice.
            parts = xy.split(",")
            item["lng"] = parts[0]
            item["lat"] = parts[1]
            yield item
def parse(self, response):
    """Extract arXiv abstract URLs from the feed and follow each one.

    Yields a ``Request`` per matched ``http://arxiv.org/abs/<id>`` URL,
    handing a FRESH ``ArxivOrgItem`` to ``parse_single_page`` via meta.
    """
    xxs = XmlXPathSelector(response)
    xxs.remove_namespaces()
    # Need to format the selector object into a str first so the URLs
    # can be pulled out with a regex (original extraction approach kept).
    xml_data = str(xxs.xpath('//link'))
    #logging.log(logging.INFO, xml_data)
    # Raw string with the dot escaped: the original pattern '\d+.\d+'
    # let "." match ANY character, accepting malformed ids.
    url_list = re.findall(r'http://arxiv\.org/abs/\d+\.\d+', xml_data)
    #logging.log(logging.INFO, url_list)
    for url in url_list:
        logging.log(
            logging.INFO,
            f'**************** crawling link: {url} ***************** ')
        # Bug fix: the original created ONE ArxivOrgItem before the loop
        # and shared it across every request, so concurrent callbacks
        # clobbered each other's fields.  Create one item per request.
        yield Request(url=url, callback=self.parse_single_page,
                      meta={'item': ArxivOrgItem()}, dont_filter=True)
def parse(self, response):
    """Parse a bus-lines XML response and yield one ZDCXItem per station.

    Each ``<line>`` node under ``<lines>`` carries a ``<name>`` (the road)
    and a ``<stats>`` text of semicolon-separated station names.  Station
    numbering (``station_num``) restarts at 1 for every road.
    """
    selector = XmlXPathSelector(response)
    for lines_node in selector.xpath("//lines"):
        # Relative paths (./line/...): the original used absolute
        # "//lines/line/..." inside this loop, re-selecting every <line>
        # in the WHOLE document per <lines> node and duplicating items
        # when more than one <lines> element exists.
        roads = lines_node.xpath("./line/name/text()").extract()
        stats = lines_node.xpath("./line/stats/text()").extract()
        # zip() pairs each road with its station list and tolerates a
        # length mismatch instead of raising IndexError on stats[i].
        for road, stat_text in zip(roads, stats):
            # enumerate(..., 1) replaces the manual counter that was
            # incremented per station and reset to 0 after each road.
            for num, station in enumerate(stat_text.split(";"), 1):
                item = ZDCXItem()
                item["road"] = road
                item["station_name"] = station
                item["station_num"] = num
                yield item