def parse(self, response):
    sel = Selector(response)
    locations = Locations()
    locations["restaurantIDs"] = sel.xpath('//a/@data-id').extract()
    # The coordinates are encoded in the request URL, not in the page body.
    locations["coordinates"] = {}
    locations["coordinates"]["longitude"] = self.coordinatesURLTranslator.getLongitude(response.url)
    locations["coordinates"]["latitude"] = self.coordinatesURLTranslator.getLatitude(response.url)
    return locations

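# Hedged sketch of what the coordinatesURLTranslator used above might look
# like, assuming the coordinates ride in the query string as lat/lng (both
# parameter names are assumptions, not from the original code):
from urllib.parse import urlparse, parse_qs


class CoordinatesURLTranslator:
    def getLatitude(self, url):
        # e.g. 'http://example.com/search?lat=40.4&lng=-3.7' -> '40.4'
        return parse_qs(urlparse(url).query).get('lat', [''])[0]

    def getLongitude(self, url):
        return parse_qs(urlparse(url).query).get('lng', [''])[0]
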
def parse_item(self, response):
    index = response.meta['index']
    if index == 1:
        # The pager text looks like "current/total"; schedule the remaining pages.
        index_count = response.selector.xpath('//*[@id="m-page"]/span/text()').extract()
        index_count = [x.strip() for x in index_count if x.strip()]
        index, count = [int(x) for x in index_count[0].split('/')]
        for i in range(index + 1, count + 1):
            yield Request(url=self.get_gn_url(i),
                          headers=TONGHUASHUN_GN_HEADER,
                          meta={'index': i},
                          callback=self.parse_item)
    trs = response.xpath('/html/body/table/tbody//tr').extract()
    try:
        for tr in trs:
            start_date = Selector(text=tr).xpath('//td[1]/text()').extract_first()
            name = Selector(text=tr).xpath('//td[2]/a/text()').extract_first()
            link = Selector(text=tr).xpath('//td[2]/a/@href').extract_first()
            news_title = Selector(text=tr).xpath('//td[3]/a/text()').extract_first()
            news_link = Selector(text=tr).xpath('//td[3]/a/@href').extract_first()
            # Use the current row (tr), not trs[0], so each sector gets its own
            # leading stocks rather than the first row's.
            leadings = [x.rsplit('/')[-2] for x in Selector(text=tr).xpath('//td[4]/a/@href').extract()]
            count = Selector(text=tr).xpath('//td[5]/text()').extract()
            yield SectorItem(id='{}_{}_{}'.format('10jqka', 'gn', name),
                             start_date=start_date,
                             name=name,
                             link=link,
                             news_title=news_title,
                             news_link=news_link,
                             leadings=leadings,
                             count=count,
                             producer='10jqka',
                             type='gn')
    except Exception as e:
        self.logger.error('error parsing 10jqka gainian sector url: {} {}'.format(response.url, e))

def parse_XML(self, response):
    if not hasattr(self, 'parse_node'):
        raise NotConfigured('You must define parse_node method in order to scrape this XML feed')
    response = self.adapt_response(response)
    # Dispatch on the configured iterator to build the node sequence.
    if self.iterator == 'iternodes':
        nodes = self._iternodes(response)
    elif self.iterator == 'xml':
        selector = Selector(response, type='xml')
        self._register_namespaces(selector)
        nodes = selector.xpath('//%s' % self.itertag)
    elif self.iterator == 'html':
        selector = Selector(response, type='html')
        self._register_namespaces(selector)
        nodes = selector.xpath('//%s' % self.itertag)
    else:
        raise NotSupported('Unsupported node iterator')
    return self.parse_nodes(response, nodes)

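# Hedged sketch of how the iterator dispatch above is typically configured:
# a subclass sets `iterator` and `itertag` and supplies parse_node. The spider
# name and feed URL below are illustrative assumptions, not from the original.
from scrapy.spiders import XMLFeedSpider


class ExampleFeedSpider(XMLFeedSpider):
    name = 'example_feed'                          # hypothetical
    start_urls = ['http://example.com/feed.xml']   # hypothetical
    iterator = 'xml'   # 'iternodes' | 'xml' | 'html', as dispatched above
    itertag = 'item'   # each node is selected as //item

    def parse_node(self, response, node):
        # Called once per matched node by parse_nodes().
        yield {'title': node.xpath('title/text()').get()}
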
def detail_parse(self, response):
    page = response.meta['page']
    # Fetch a fresh token, then request one page of the list with it.
    token = json.loads(requests.post(self.token_url, headers=self.header).text, strict=False).get('d', '')
    data = copy.deepcopy(self.data)
    data.update({'Token': token, 'PageIndex': str(page)})
    list_content = json.loads(requests.post(self.list_url, headers=self.header, json=data).text,
                              strict=False).get('d', '')
    cont_list = json.loads(list_content).get('Table', [])
    for cont in cont_list:
        result_dict = {}
        info_id = cont.get('InfoID', '')
        # Each detail request needs its own token.
        post_data = {
            "Token": json.loads(requests.post(self.token_url, headers=self.header).text, strict=False).get('d', ''),
            "PageIndex": "1",
            "PageSize": "1",
            "InfoID": info_id,
        }
        detail_content = json.loads(requests.post(self.detail_url, headers=self.header, json=post_data).text,
                                    strict=False).get('d', '')
        detail = json.loads(detail_content, strict=False).get('Table', [])[0]
        result_dict['punish_code'] = detail.get('name1', '')
        result_dict['case_name'] = detail.get('name2', '')
        result_dict['punish_category_one'] = detail.get('name3', '')
        result_dict['punish_category_two'] = detail.get('name4', '')
        result_dict['punish_type'] = detail.get('name5', '')
        result_dict['punish_basis'] = detail.get('name6', '')
        result_dict['company_name'] = detail.get('name7', '')
        result_dict['credit_code'] = detail.get('name8', '')
        result_dict['organization_code'] = detail.get('name9', '')
        result_dict['regno'] = detail.get('name10', '')
        result_dict['tax_code'] = detail.get('name11', '')
        result_dict['id_number'] = detail.get('name12', '')
        result_dict['frname'] = detail.get('name13', '')
        result_dict['punish_content'] = detail.get('name14', '')
        result_dict['public_date'] = detail.get('name15', '')
        result_dict['punish_org'] = detail.get('name16', '')
        result_dict['update'] = detail.get('infodate', '')
        # Some fields arrive as inline-styled HTML; strip the markup down to text.
        for key, value in result_dict.items():
            result_dict[key] = (''.join(Selector(text=value).xpath('//p//text()').extract()).strip()
                                if '<p style' in value else value)
        yield self.handle_result(response, result_dict, info_id)

def parse_store(self, response, js):
    props = {}
    # The address field is an HTML fragment; pull the text out of its <p>.
    props["addr_full"] = Selector(text=js["address"]).xpath("//p/text()").get()
    props["ref"] = js["url_title"]
    props["lat"] = js["coordinates"][0]
    props["lon"] = js["coordinates"][1]
    props["city"] = js["city"]
    props["state"] = js["state"]
    props["postcode"] = js["zip"]
    props["phone"] = js["phone_number"]
    hours = response.css(".hours p:not(:empty)").xpath("text()").get()
    props["opening_hours"] = hours
    return GeojsonPointItem(**props)

def parse(self, response):
    sel = Selector(response)
    restaurants = sel.xpath('//a[contains(@id, "establecimiento")]')
    for restaurant in restaurants:
        locationCsv = LocationCsv()
        locationCsv["id_restaurante"] = restaurant.css("a::attr(data-id)").extract()
        locationCsv["nombre_restaurante"] = restaurant.css("a .result-info h4::text").extract()
        # Latitude/longitude are decoded from the request URL, not the page.
        locationCsv["latitud"] = self.coordinatesURLTranslator.getLatitude(response.url)
        locationCsv["longitud"] = self.coordinatesURLTranslator.getLongitude(response.url)
        yield locationCsv

def _validate_response(self, response: Union[Response, str]) -> bool:
    """
    Check that the page does not declare a ROBOTS meta tag.

    :param response: a Response, or raw HTML to wrap in a Selector
    :type response: Union[Response, str]
    :return: True if no <meta name="ROBOTS"> is present
    :rtype: bool
    """
    if isinstance(response, str):
        response = Selector(text=response)
    names_in_meta: List[str] = response.xpath("/html/head/meta").xpath("@name").extract()
    return "ROBOTS" not in names_in_meta

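# A minimal check of the same ROBOTS logic on raw HTML strings (the markup
# below is made up for illustration; Selector is scrapy's, as assumed above):
from scrapy.selector import Selector

blocked = '<html><head><meta name="ROBOTS" content="noindex"></head></html>'
allowed = '<html><head><meta name="viewport" content="width=device-width"></head></html>'
assert "ROBOTS" in Selector(text=blocked).xpath("/html/head/meta/@name").extract()
assert "ROBOTS" not in Selector(text=allowed).xpath("/html/head/meta/@name").extract()
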
def parse_quick_facts(self, selector: Selector, quest: Quest):
    """
    parses the quick facts section on a wowhead quest page

    :param selector: selector of the quick facts section
    :param quest: quest item to store gathered info in
    :return:
    """
    # re() returns the matched HTML fragment, which is re-wrapped in a
    # Selector so the <a> tag inside it can be queried with XPath.
    result = selector.re(r"Start:\s(.*</a>)")
    if result:
        element = Selector(text=result[0])
        quest["npc"] = element.xpath("//a/text()").get()
        quest["npc_link"] = self.base_url + element.xpath("//a/@href").get()
    else:
        quest["npc"] = "Unknown"
        quest["npc_link"] = "Unknown"

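# A small illustration of the re()-then-rewrap pattern used above; the HTML
# fragment is invented, not taken from wowhead:
from scrapy.selector import Selector

frag = Selector(text='Start: <a href="/npc=123">Some NPC</a>')
match = frag.re(r"Start:\s(.*</a>)")   # ['<a href="/npc=123">Some NPC</a>']
inner = Selector(text=match[0])
assert inner.xpath("//a/text()").get() == "Some NPC"
assert inner.xpath("//a/@href").get() == "/npc=123"
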
def parse(self, response):
    sel = Selector(response)
    sites = sel.xpath('//div[@class="mainleft"]')
    itemlist = []
    for site in sites:
        item = CnkispiderItem()
        # Fill the extracted values into the corresponding item fields.
        title = site.xpath('//*[@id="chTitle"]/text()').extract()
        item['title'] = [t.encode('utf-8') for t in title]
        author = site.xpath('//*[@id="content"]/div[1]/div[3]/div[2]/p[1]/a/text()').extract()
        # extract() returns a list, never None, so test for emptiness instead.
        if not author:
            author = site.xpath('//*[@id="content"]/div[1]/div[2]/p[1]/a/text()').extract()
        item['author'] = [a.encode('utf-8') for a in author]
        institution = site.xpath('//*[@id="content"]/div[1]/div[3]/div[2]/p[3]/a/text()').extract()
        item['institution'] = [i.encode('utf-8') for i in institution]
        abstract = site.xpath('//*[@id="ChDivSummary"]/text()').extract()
        item['abstract'] = [a.encode('utf-8') for a in abstract]
        keyWord = site.xpath('//*[@id="ChDivKeyWord"]/a/text()').extract()
        item['keyWord'] = [k.encode('utf-8') for k in keyWord]
        downloadFreq = site.xpath('//*[@id="content"]/div[1]/div[5]/ul/li/text()').re(u'\s*【下载频次】(.*)')
        item['downloadFreq'] = [d.encode('utf-8') for d in downloadFreq]
        quoteFreq = site.xpath('//*[@id="rc3"]/text()').re('\W(\d+)\W')
        item['quoteFreq'] = [q.encode('utf-8') for q in quoteFreq]
        itemlist.append(item)
        # Log at INFO level as each item is appended.
        log.msg("Appending item...", level=log.INFO)
    log.msg("Append done.", level=log.INFO)
    return itemlist

schema = FakeContainer(descriptors['#default'])
validate = schema._validate_and_adapt_item
_names_map = {'daft_ie': 'daft', 'patchofland': 'pol'}
ibl_extractors = {}
ibl_pages = {}
selector_pages = {}
for template_name in ('daft_ie', 'hn', 'patchofland'):
    with open('%s/data/templates/%s.html' % (_PATH, template_name)) as f:
        html_page = HtmlPage(body=f.read().decode('utf-8'))
    name = _names_map.get(template_name, template_name)
    ibl_pages[name] = html_page
    ibl_extractors[name] = SlybotIBLExtractor([(html_page, descriptors, '0.13.0')])
    selector_pages[name] = Selector(text=html_page.body)


class TestExtractionSpeed(TestCase):

    def test_parsel_parse_and_extract(self):
        # Re-parse each page with parsel on every iteration, then extract.
        for i in range(ITERATIONS):
            for name, page in ibl_pages.items():
                s = Selector(text=page.body)
                extract(parsel_extractors[name], s)

    def test_slybot_parse_and_extract(self):
        # The same benchmark through slybot's IBL extractor for comparison.
        for i in range(ITERATIONS):
            for name, page in ibl_pages.items():
                extraction_page = HtmlPage(body=page.body)
                ibl_extractors[name].extract(extraction_page)

def extractData(self, body, xpath):
    # Accept either a raw HTML string or a Response object.
    if isinstance(body, str):
        return Selector(text=body).xpath(xpath).extract()
    return Selector(response=body).xpath(xpath).extract()

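# Usage sketch for extractData; the HTML is illustrative. A raw string takes
# the Selector(text=...) branch, so a call like
# obj.extractData('<ul><li>a</li><li>b</li></ul>', '//li/text()')
# is equivalent to:
from scrapy.selector import Selector

print(Selector(text='<ul><li>a</li><li>b</li></ul>').xpath('//li/text()').extract())  # ['a', 'b']
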
def __init__(self, response):
    self.sel = Selector(response)