Example #1
0
 def parse(self, response):
     """Parse a <stats> XML response and yield one GJZDItem per station.

     Each <stat> node is expected to carry a <name> and an <xy> child,
     where <xy> is a "lng,lat" comma-separated pair.

     :param response: the Scrapy response holding the XML document;
                      ``response.meta["source"]`` (optional) tags each item.
     :yields: populated ``GJZDItem`` instances.
     """
     x = XmlXPathSelector(response)
     zp_nodes = x.xpath("//stats")
     source = response.meta.get("source", "")
     for zp_node in zp_nodes:
         # BUG FIX: the name query used "////stats/..." — a quadruple slash,
         # which is invalid XPath and raises an evaluation error. It now
         # matches the sibling xy query below.
         # NOTE(review): both queries are absolute ("//stats/..."), so if the
         # document ever contains more than one <stats> node the same items
         # are re-extracted per node — presumably there is only one; confirm.
         names = zp_node.xpath("//stats/stat/name/text()").extract()
         xys = zp_node.xpath("//stats/stat/xy/text()").extract()
         # zip pairs name/xy safely even if the lists differ in length,
         # where the old range(len(names)) indexing would IndexError.
         for name, xy in zip(names, xys):
             gz_item = GJZDItem()
             gz_item["name"] = name
             gz_item["source"] = source
             parts = xy.split(",")
             gz_item["lng"] = parts[0]
             gz_item["lat"] = parts[1]
             yield gz_item
 def parse(self, response):
     """Extract arXiv abstract URLs from an XML feed and crawl each one.

     Strips XML namespaces, stringifies the <link> selector list, and
     regex-scans it for abstract URLs, issuing one Request per match.

     :param response: the Scrapy response holding the XML feed.
     :yields: ``Request`` objects routed to ``self.parse_single_page``,
              each carrying a shared ``ArxivOrgItem`` in its meta.
     """
     item = ArxivOrgItem()
     xxs = XmlXPathSelector(response)
     xxs.remove_namespaces()
     # The selector list must be stringified first so it can be regex-scanned.
     xml_data = str(xxs.xpath('//link'))
     # BUG FIX: the pattern was a non-raw string with unescaped dots, so '.'
     # matched ANY character (e.g. "arxivXorg" or "1234X5678" would match).
     # Raw string + escaped dots pin the literal URL shape.
     url_list = re.findall(r'http://arxiv\.org/abs/\d+\.\d+', xml_data)
     for url in url_list:
         logging.log(
             logging.INFO,
             f'**************** crawling link: {url} ***************** ')
         # dont_filter=True: the same abstract may appear in several feeds;
         # bypass Scrapy's duplicate filter so each listing is re-visited.
         yield Request(url=url,
                       callback=self.parse_single_page,
                       meta={'item': item},
                       dont_filter=True)
Example #3
0
 def parse(self, response):
     """Parse a <lines> XML response and yield one ZDCXItem per station.

     Every <line> node carries a <name> and a semicolon-separated <stats>
     list of station names; stations are numbered from 1 within each road.

     :param response: the Scrapy response holding the XML document.
     :yields: populated ``ZDCXItem`` instances.
     """
     selector = XmlXPathSelector(response)
     for lines_node in selector.xpath("//lines"):
         roads = lines_node.xpath("//lines/line/name/text()").extract()
         stat_strings = lines_node.xpath("//lines/line/stats/text()").extract()
         for idx, road_name in enumerate(roads):
             stations = stat_strings[idx].split(";")
             # enumerate(start=1) replaces the original manual counter that
             # was incremented per station and reset after each road.
             for station_num, station_name in enumerate(stations, start=1):
                 zd_item = ZDCXItem()
                 zd_item["road"] = road_name
                 zd_item["station_name"] = station_name
                 zd_item["station_num"] = station_num
                 yield zd_item