def parse(self, response: scrapy.http.Response):
    print("Parsing URL: " + response.url)

    # Call Splash only once per page (that contains multiple XML elements).
    data = self.getUrlData(response.url)
    response.meta["rendered_data"] = data

    # We would use .fromstring(response.text) if the response did not include the XML declaration:
    # <?xml version="1.0" encoding="utf-8"?>
    root = etree.XML(response.body)
    tree = etree.ElementTree(root)

    # If results are returned.
    elements = tree.xpath("/root/items/*")
    if len(elements) > 0:
        for element in elements:
            copyResponse = response.copy()
            element_xml_str = etree.tostring(
                element, pretty_print=True, encoding="unicode")
            element_dict = xmltodict.parse(element_xml_str)

            # Temporary solution for public-only content.
            # TODO: remove this when licensed content is enabled!
            if not self.is_public(element_dict["data"]):
                continue

            # TODO: It's probably a pointless attribute.
            # del element_dict["data"]["score"]

            # Passing the dictionary for easier access to attributes.
            copyResponse.meta["item"] = element_dict["data"]

            # In case JSON string representation is preferred:
            # copyResponse._set_body(json.dumps(copyResponse.meta['item'], indent=1, ensure_ascii=False))
            copyResponse._set_body(element_xml_str)

            if self.hasChanged(copyResponse):
                yield self.handleEntry(copyResponse)

            # LomBase.parse() has to be called for every individual instance
            # that needs to be saved to the database.
            LomBase.parse(self, copyResponse)

            # TODO: To not stress the Rest APIs.
            # time.sleep(0.1)

    # If the number of returned results is equal to the imposed limit,
    # it means that there are more to be returned.
    if len(elements) == self.limit:
        self.page += 1
        url = self.apiUrl.replace("%start", str(self.page * self.limit)).replace(
            "%anzahl", str(self.limit))
        yield scrapy.Request(
            url=url,
            callback=self.parse,
            headers={
                "Accept": "application/xml",
                "Content-Type": "application/xml",
            },
        )
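# A minimal, self-contained illustration (not part of the spider above) of what
# xmltodict.parse() returns for one serialized element, assuming each item is wrapped
# in a <data> element as implied by element_dict["data"]; the child tags (<id>, <title>)
# are hypothetical placeholders.
import xmltodict

example_xml = "<data><id>42</id><title>Example entry</title></data>"
example_dict = xmltodict.parse(example_xml)
# xmltodict keys the resulting dict by tag name, so the payload is reached via
# example_dict["data"], mirroring element_dict["data"] in the spider above.
assert example_dict["data"]["id"] == "42"  # note: element text is parsed as a string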
def parse(self, response: scrapy.http.Response):
    # Call Splash only once per page (that contains multiple XML elements).
    data = self.getUrlData(response.url)
    response.meta["rendered_data"] = data

    # response.text replaces the deprecated Response.body_as_unicode().
    elements = json.loads(response.text)
    for i, element in enumerate(elements):
        copyResponse = response.copy()

        # Passing the dictionary for easier access to attributes.
        copyResponse.meta["item"] = element

        # In case JSON string representation is preferred:
        json_str = json.dumps(element, indent=4, sort_keys=True, ensure_ascii=False)
        copyResponse._set_body(json_str)
        print(json_str)

        if self.hasChanged(copyResponse):
            yield self.handleEntry(copyResponse)

        # LomBase.parse() has to be called for every individual instance
        # that needs to be saved to the database.
        LomBase.parse(self, copyResponse)
def parse_content_api_json(self, response: scrapy.http.Response):
    # Build up an item:
    md = json.loads(response.body)
    item = LandingPageItem()
    item['landing_page_url'] = response.url
    item['title'] = md['title']
    item['first_publication_date'] = md['first_published_at']
    item['publication_date'] = md['first_published_at']
    # Pick up the 'public updated' date instead, if present:
    if 'public_updated_at' in md:
        item['publication_date'] = md['public_updated_at']
    item['publishers'] = []
    for org in md['links']['organisations']:
        item['publishers'].append(org['title'])
    # item['publisher_metadata'] = md

    # Make a response object to make it easy to parse the HTML fragments in the API:
    resp = response.copy()

    # Go through the documents:
    item['documents'] = []
    for doc in md['details']['documents']:
        resp._set_body(doc)
        doc_item = DocumentItem()
        doc_item['title'] = resp.css('.title ::text').extract_first()
        doc_item['document_url'] = response.urljoin(
            resp.css('.attachment-details a::attr(href)').extract_first())
        doc_item['isbn'] = resp.css('span[class=isbn] ::text').extract_first()
        doc_item['command_paper_number'] = resp.css(
            'span[class=command_paper_number] ::text').extract_first()
        doc_item['house_of_commons_paper_number'] = resp.css(
            'span[class=house_of_commons_paper_number] ::text').extract_first()
        item['documents'].append(dict(doc_item))

    # Return the composite item:
    yield item
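# An alternative sketch (an assumption, not the author's code): instead of copying the
# response and overwriting its body through the private Response._set_body(), each HTML
# fragment returned by the API can be wrapped in a Scrapy Selector directly. The fragment
# below is a hypothetical stand-in for an entry of md['details']['documents'].
from scrapy.selector import Selector

fragment = (
    '<div class="attachment-details">'
    '<h2 class="title">Report</h2>'
    '<a href="/government/uploads/report.pdf">Download</a>'
    '</div>'
)
sel = Selector(text=fragment)
title = sel.css('.title ::text').get()                      # "Report"
href = sel.css('.attachment-details a::attr(href)').get()   # "/government/uploads/report.pdf"
# response.urljoin(href) would still be needed to resolve the relative URL.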
def parse(self, response: scrapy.http.Response):
    def contains_exceptions(cat_tree):
        return bool(set(cat_tree) & set(self.cat_2_exceptions))

    page_type = AlloSpider.get_page_type(response)
    if page_type == PageType.OTHER:
        self.logger.info('Skipping page (unknown type) ' + response.url)
        return
    elif page_type == PageType.NO_GOODS:
        self.logger.info('Skipping page (no goods) ' + response.url)
        return
    if contains_exceptions(response.meta.get(Names.CAT_TREE_KEY)):
        self.logger.info('Skipping page (Exception) ' + response.url)
        return

    parser = MainAlloParser(self, response.copy())
    if page_type == PageType.PRODUCTS:
        return parser.yield_products()
    elif page_type == PageType.CATALOG:
        return parser.yield_catalog()
    elif page_type == PageType.GOOD:
        return parser.yield_item()
def check_for_dead_ends_before_parsing(self, response: scrapy.http.Response):
    """
    Checks if the current response.url has already been parsed or is on the "skip_these_urls"-list.
    If the current url hasn't been parsed already, copies the response and calls parse to gather
    metadata from the current .html

    :param response:
    :return:
    """
    if response is not None:
        # Only call the parse method if the current url is no dead-end without content:
        table_body = response.xpath('//table[@class="smalltable"]')
        if table_body is not None:
            no_entry_regex = re.compile(r'Bisher kein Eintrag')
            for table_item in table_body:
                if (no_entry_regex.search(table_item.get())) is not None:
                    self.debug_dead_end_counter += 1
                    # print("The URL", response.url, "is a 'Bisher kein Eintrag'-dead-end.")
                    # print("check_for_dead_ends... Method: already parsed URLs =", len(self.debug_parsed_urls),
                    #       "| gathered urls =", len(self.navigation_urls), "| skip_these_urls =",
                    #       len(self.skip_these_urls), "| Total amount of dead-ends:", self.debug_dead_end_counter)
                # check if the current url has already been parsed:
                elif (response.url not in self.debug_parsed_urls) and (response is not None):
                    # check if current url contains an undesired url-pattern
                    skip_check = False
                    for url_pattern in self.skip_these_urls:
                        current_regex = re.compile(url_pattern)
                        if current_regex.search(response.url) is not None:
                            skip_check = True
                    # if the current url is a "fresh" one, call the parse method to extract metadata
                    if skip_check is False:
                        # print("URL TO BE PARSED: ", response.url)
                        self.debug_parsed_urls.add(response.url)
                        response_copy = response.copy()
                        yield from self.parse(response_copy)