def parse(self, response: scrapy.http.Response):
    """Split the grouped JSON listing into per-element pseudo-responses and parse each.

    Removed a leftover debug `print(json_str)` and the unused `enumerate` index.
    """
    # Call Splash only once per page (that contains multiple XML elements).
    data = self.getUrlData(response.url)
    response.meta["rendered_data"] = data
    elements = json.loads(response.body_as_unicode())
    # grouped_elements = self.group_elements_by_medium_id(elements)
    grouped_elements = self.group_elements_by_sammlung(elements)
    for element in grouped_elements:
        copyResponse = response.copy()
        # Passing the dictionary for easier access to attributes.
        copyResponse.meta["item"] = element
        # In case JSON string representation is preferred:
        json_str = json.dumps(element, indent=4, sort_keys=True, ensure_ascii=False)
        copyResponse._set_body(json_str)
        if self.hasChanged(copyResponse):
            yield self.handleEntry(copyResponse)
        # LomBase.parse() has to be called for every individual instance that needs to be saved to the database.
        LomBase.parse(self, copyResponse)
def parse(self, response: scrapy.http.Response): print("Parsing URL: " + response.url) # Call Splash only once per page (that contains multiple XML elements). data = self.getUrlData(response.url) response.meta["rendered_data"] = data # We would use .fromstring(response.text) if the response did not include the XML declaration: # <?xml version="1.0" encoding="utf-8"?> root = etree.XML(response.body) tree = etree.ElementTree(root) # If results are returned. elements = tree.xpath("/root/items/*") if len(elements) > 0: for element in elements: copyResponse = response.copy() element_xml_str = etree.tostring(element, pretty_print=True, encoding="unicode") element_dict = xmltodict.parse(element_xml_str) # Temporary solution for public-only content. # TODO: remove this when licensed content are enabled! if not self.is_public(element_dict["data"]): continue # TODO: It's probably a pointless attribute. # del element_dict["data"]["score"] # Passing the dictionary for easier access to attributes. copyResponse.meta["item"] = element_dict["data"] # In case JSON string representation is preferred: # copyResponse._set_body(json.dumps(copyResponse.meta['item'], indent=1, ensure_ascii=False)) copyResponse._set_body(element_xml_str) if self.hasChanged(copyResponse): yield self.handleEntry(copyResponse) # LomBase.parse() has to be called for every individual instance that needs to be saved to the database. LomBase.parse(self, copyResponse) # TODO: To not stress the Rest APIs. # time.sleep(0.1) # If the number of returned results is equal to the imposed limit, it means that there are more to be returned. if len(elements) == self.limit: self.page += 1 url = self.apiUrl.replace("%start", str(self.page * self.limit)).replace( "%anzahl", str(self.limit)) yield scrapy.Request( url=url, callback=self.parse, headers={ "Accept": "application/xml", "Content-Type": "application/xml", }, )
def parse(self, response: scrapy.http.Response):
    """Turn the JSON listing into per-item pseudo-responses and hand each one off."""
    raw_elements = json.loads(response.body_as_unicode())
    prepared = [self.prepare_element(entry) for entry in raw_elements]
    for item in self.prepare_collections(prepared):
        item_response = response.copy()
        # Expose the dictionary via meta for convenient attribute access.
        item_response.meta["item"] = item
        # A JSON string body, in case that representation is preferred downstream.
        item_response._set_body(
            json.dumps(item, indent=4, sort_keys=True, ensure_ascii=False)
        )
        if self.hasChanged(item_response):
            yield self.handleEntry(item_response)
        # LomBase.parse() must run for every instance that gets persisted.
        LomBase.parse(self, item_response)
def parseEntry(self, response):
    """Persist the entry only if its language is German; otherwise skip it.

    Returns the LomBase result for "de" entries, None for everything else.
    """
    # Fetch the language once instead of twice.
    language = self.get("language", response=response)
    if language == "de":
        return LomBase.parse(self, response)
    # Fixed typo ("Skpping") and switched to lazy %-style logging arguments.
    logging.info("Skipping entry with language %s", language)
    return None
def parse(self, response):
    """Yield a LomBase item for every CSV row that yields a valid id."""
    decoded_body = response.body.decode("UTF-8")
    csv_rows = self.readCSV(
        csv.reader(StringIO(decoded_body), delimiter=","), 2)
    for csv_row in csv_rows:
        row_response = response.copy()
        row_response.meta["row"] = csv_row
        if not self.getId(row_response):
            continue
        yield LomBase.parse(self, row_response)
def parse(self, response):
    """Parse one API page of nodes; when any were returned, request the next page."""
    data = json.loads(response.body_as_unicode())
    nodes = data["nodes"]
    if nodes:
        for node in nodes:
            node_response = response.replace(url=node["content"]["url"])
            node_response.meta["item"] = node
            if self.hasChanged(node_response):
                yield LomBase.parse(self, node_response)
        pagination = data["pagination"]
        yield self.search(pagination["from"] + pagination["count"])
def handleEntry(self, response):
    """Delegate item construction for this entry to the shared LomBase logic."""
    return LomBase.parse(self, response)
def parse(self, response):
    """Forward the response straight to the shared LomBase parser."""
    return LomBase.parse(self, response)
def parseRecord(self, response):
    """Build and return the LOM item for a single record response."""
    return LomBase.parse(self, response)
def startHandler(self, response):
    """Yield one parsed item per RSS <item> element.

    Bug fix: the original yielded ``LomBase.parse(self, response)`` with the
    unmodified response, so the per-item ``responseCopy.meta['item']`` was
    never used (the sibling spider's startHandler shows the intended form).
    """
    for item in response.xpath('//rss/channel/item'):
        responseCopy = response.copy()
        responseCopy.meta['item'] = item
        # Parse the copy that actually carries this item's metadata.
        yield LomBase.parse(self, responseCopy)
def startHandler(self, response):
    """For each RSS item, parse a response re-pointed at the item's own link URL."""
    for rss_item in response.xpath("//rss/channel/item"):
        link_url = rss_item.xpath("link//text()").get()
        item_response = response.replace(url=link_url)
        item_response.meta["item"] = rss_item
        yield LomBase.parse(self, item_response)
def parse(self, response):
    """Print the URL being processed, then defer to the shared LomBase parser."""
    print(response.url)
    return LomBase.parse(self, response)
def parseEntry(self, response):
    """Persist only German-language entries; skip (and log) everything else."""
    # Fetch the language once instead of twice.
    language = self.get('language', response=response)
    if language == 'de':
        return LomBase.parse(self, response)
    # Fixed typo ('Skpping') and switched to lazy logging arguments.
    logging.info('Skipping entry with language %s', language)
    return None
def parse(self, response: scrapy.http.Response):
    """Parse one XML results page with tqdm progress reporting, then paginate.

    The progress bar is created lazily on the first page, sized from the
    /root/sum total minus the elements already skipped by self.page.
    """
    print("Parsing URL: " + response.url)
    # Call Splash only once per page (that contains multiple XML elements).
    data = self.getUrlData(response.url)
    response.meta["rendered_data"] = data
    # We would use .fromstring(response.text) if the response did not include the XML declaration:
    # <?xml version="1.0" encoding="utf-8"?>
    root = etree.XML(response.body)
    tree = etree.ElementTree(root)
    # pbar works even with self.page > 0.
    if self.pbar is None:
        total_elements = int(tree.xpath('/root/sum')[0].text)
        remaining_elements = total_elements - self.page * self.limit
        self.pbar = tqdm(total=(remaining_elements),
                         desc=self.name + " downloading progress: ",
                         initial=self.page * self.limit)
    # If results are returned.
    elements = tree.xpath('/root/items/*')
    if len(elements) > 0:
        for element in elements:
            self.pbar.update(1)
            copyResponse = response.copy()
            element_xml_str = etree.tostring(element, pretty_print=True, encoding='unicode')
            element_dict = xmltodict.parse(element_xml_str)
            # TODO: It's probably a pointless attribute.
            # del element_dict["data"]["score"]
            # Passing the dictionary for easier access to attributes.
            copyResponse.meta['item'] = element_dict["data"]
            # In case JSON string representation is preferred:
            # copyResponse._set_body(json.dumps(copyResponse.meta['item'], indent=1, ensure_ascii=False))
            copyResponse._set_body(element_xml_str)
            if self.hasChanged(copyResponse):
                yield self.handleEntry(copyResponse)
            # LomBase.parse() has to be called for every individual instance that needs to be saved to the database.
            LomBase.parse(self, copyResponse)
            # TODO: To not stress the Rest APIs.
            # time.sleep(0.1)
    # If the number of returned results is equal to the imposed limit, it means that there are more to be returned.
    if len(elements) == self.limit:
        self.page += 1
        # %start / %anzahl are placeholders in the configured API URL template.
        url = self.apiUrl.replace('%start', str(self.page * self.limit)).replace(
            '%anzahl', str(self.limit))
        yield scrapy.Request(url=url, callback=self.parse,
                             headers={
                                 'Accept': 'application/xml',
                                 'Content-Type': 'application/xml'
                             })
def handleLink(self, response):
    """Hand the linked response over to the shared LomBase parser."""
    return LomBase.parse(self, response)
def parse(self, response: scrapy.http.Response):
    """Parse one XML page, yielding items only for elements with county ids, then paginate.

    Pagination continues while the expected number of fetched elements stays
    below the total reported in /root/sum.
    """
    print("Parsing URL: " + response.url)
    # Call Splash only once per page (that contains multiple XML elements).
    data = self.getUrlData(response.url)
    response.meta["rendered_data"] = data
    # We would use .fromstring(response.text) if the response did not include the XML declaration:
    # <?xml version="1.0" encoding="utf-8"?>
    root = etree.XML(response.body)
    tree = etree.ElementTree(root)
    # Get the total number of possible elements
    elements_total = int(tree.xpath('/root/sum')[0].text)
    elements = tree.xpath("/root/items/*")
    for element in elements:
        copyResponse = response.copy()
        element_xml_str = etree.tostring(element, pretty_print=True, encoding="unicode")
        # Pre-bind so the except-branch can never hit an unbound name if
        # xmltodict.parse() itself raises.
        element_dict = {}
        try:
            element_dict = xmltodict.parse(element_xml_str)
            element_dict = element_dict["data"]
            # Preparing the values here helps for all following logic across the methods.
            self.prepare_element(element_dict)
            # If there is no available county (Kreis) code, then we do not want to deal with this element.
            if not element_dict.get("county_ids"):
                continue
            # Passing the dictionary for easier access to attributes.
            copyResponse.meta["item"] = element_dict
            # In case JSON string representation is preferred:
            # copyResponse._set_body(json.dumps(copyResponse.meta['item'], indent=1, ensure_ascii=False))
            copyResponse._set_body(element_xml_str)
            if self.hasChanged(copyResponse):
                yield self.handleEntry(copyResponse)
            # LomBase.parse() has to be called for every individual instance that needs to be saved to the database.
            LomBase.parse(self, copyResponse)
        except Exception as e:
            # Bug fix: the original conditional expression bound tighter than
            # intended — ("prefix" + x) if cond else "" — so a missing
            # "id_local" printed an empty string instead of the prefix.
            print("Issues with the element: "
                  + (str(element_dict["id_local"]) if "id_local" in element_dict else ""))
            print(str(e))
    current_expected_count = (self.page + 1) * self.limit
    # TODO: To not stress the Rest APIs.
    # time.sleep(0.1)
    # If we are below the total available number, continue fetching more pages.
    if current_expected_count < elements_total:
        self.page += 1
        url = self.apiUrl.replace("%start", str(self.page * self.limit)).replace(
            "%anzahl", str(self.limit))
        yield scrapy.Request(
            url=url,
            callback=self.parse,
            headers={
                "Accept": "application/xml",
                "Content-Type": "application/xml",
            },
        )
def parse(self, response):
    """Skip serlo entries that have no description; otherwise parse normally."""
    description = self.get('description', response=response)
    if description:
        return LomBase.parse(self, response)
    logging.info('skipping empty entry in serlo')
    return None
def handleEntry(self, response):
    """Attach the decoded JSON payload as meta['item'], then parse via LomBase."""
    payload = json.loads(response.body_as_unicode())
    response.meta["item"] = payload
    return LomBase.parse(self, response)