def parse(self, response: scrapy.http.Response):

        # Call Splash only once per page (each page contains multiple XML elements).
        data = self.getUrlData(response.url)
        response.meta["rendered_data"] = data
        elements = json.loads(response.body_as_unicode())

        # grouped_elements = self.group_elements_by_medium_id(elements)
        grouped_elements = self.group_elements_by_sammlung(elements)

        for i, element in enumerate(grouped_elements):
            copyResponse = response.copy()

            # Passing the dictionary for easier access to attributes.
            copyResponse.meta["item"] = element

            # In case JSON string representation is preferred:
            json_str = json.dumps(element,
                                  indent=4,
                                  sort_keys=True,
                                  ensure_ascii=False)
            copyResponse._set_body(json_str)
            print(json_str)

            if self.hasChanged(copyResponse):
                yield self.handleEntry(copyResponse)

            # LomBase.parse() has to be called for every individual instance that needs to be saved to the database.
            LomBase.parse(self, copyResponse)
    def parse(self, response: scrapy.http.Response):
        print("Parsing URL: " + response.url)

        # Call Splash only once per page (each page contains multiple XML elements).
        data = self.getUrlData(response.url)
        response.meta["rendered_data"] = data

        # We would use .fromstring(response.text) if the response did not include the XML declaration:
        # <?xml version="1.0" encoding="utf-8"?>
        root = etree.XML(response.body)
        tree = etree.ElementTree(root)

        # If results are returned.
        elements = tree.xpath("/root/items/*")
        if len(elements) > 0:
            for element in elements:
                copyResponse = response.copy()
                element_xml_str = etree.tostring(element,
                                                 pretty_print=True,
                                                 encoding="unicode")
                element_dict = xmltodict.parse(element_xml_str)

                # Temporary solution for public-only content.
                # TODO: remove this once licensed content is enabled!
                if not self.is_public(element_dict["data"]):
                    continue

                # TODO: It's probably a pointless attribute.
                # del element_dict["data"]["score"]

                # Passing the dictionary for easier access to attributes.
                copyResponse.meta["item"] = element_dict["data"]

                # In case JSON string representation is preferred:
                # copyResponse._set_body(json.dumps(copyResponse.meta['item'], indent=1, ensure_ascii=False))
                copyResponse._set_body(element_xml_str)

                if self.hasChanged(copyResponse):
                    yield self.handleEntry(copyResponse)

                # LomBase.parse() has to be called for every individual instance that needs to be saved to the database.
                LomBase.parse(self, copyResponse)

        # TODO: sleep briefly so as not to stress the REST APIs.
        # time.sleep(0.1)

        # If the number of returned results equals the imposed limit, more results may be available.
        if len(elements) == self.limit:
            self.page += 1
            url = self.apiUrl.replace("%start",
                                      str(self.page * self.limit)).replace(
                                          "%anzahl", str(self.limit))
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                headers={
                    "Accept": "application/xml",
                    "Content-Type": "application/xml",
                },
            )
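
The %start/%anzahl substitution above is plain string templating over the configured API URL. A minimal standalone sketch of how the next-page URL is derived (the apiUrl template below is a hypothetical placeholder, not the real endpoint):

# Hypothetical template; "%anzahl" is German for "count", "%start" is the offset.
apiUrl = "https://api.example.org/records?start=%start&anzahl=%anzahl"
page, limit = 1, 100
url = apiUrl.replace("%start", str(page * limit)).replace("%anzahl", str(limit))
assert url == "https://api.example.org/records?start=100&anzahl=100"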
Example #3
    def parse(self, response: scrapy.http.Response):
        elements = json.loads(response.body_as_unicode())
        prepared_elements = [
            self.prepare_element(element_dict) for element_dict in elements
        ]

        collection_elements = self.prepare_collections(prepared_elements)

        for i, element_dict in enumerate(collection_elements):

            copyResponse = response.copy()

            # Passing the dictionary for easier access to attributes.
            copyResponse.meta["item"] = element_dict

            # In case JSON string representation is preferred:
            json_str = json.dumps(element_dict,
                                  indent=4,
                                  sort_keys=True,
                                  ensure_ascii=False)
            copyResponse._set_body(json_str)

            if self.hasChanged(copyResponse):
                yield self.handleEntry(copyResponse)

            # LomBase.parse() has to be called for every individual instance that needs to be saved to the database.
            LomBase.parse(self, copyResponse)
 def parseEntry(self, response):
     if self.get("language", response=response) == "de":
         return LomBase.parse(self, response)
     logging.info(
         "Skpping entry with language " + self.get("language", response=response)
     )
     return None
 def parse(self, response):
     rows = self.readCSV(
         csv.reader(StringIO(response.body.decode("UTF-8")), delimiter=","),
         2)
     for row in rows:
         copyResponse = response.copy()
         copyResponse.meta["row"] = row
         if self.getId(copyResponse):
             yield LomBase.parse(self, copyResponse)
 def parse(self, response):
     data = json.loads(response.body_as_unicode())
     if len(data["nodes"]) > 0:
         for item in data["nodes"]:
             copyResponse = response.replace(url=item["content"]["url"])
             copyResponse.meta["item"] = item
             if self.hasChanged(copyResponse):
                 yield LomBase.parse(self, copyResponse)
         yield self.search(data["pagination"]["from"] +
                           data["pagination"]["count"])
 def handleEntry(self, response):
     return LomBase.parse(self, response)
 def parse(self, response):
     return LomBase.parse(self, response)
Example #9
 def parseRecord(self, response):
     lom = LomBase.parse(self, response)
     return lom
Example #10
 def startHandler(self, response):
     for item in response.xpath('//rss/channel/item'):
         responseCopy = response.copy()
         responseCopy.meta['item'] = item
         yield LomBase.parse(self, responseCopy)  # yield the copy that carries meta['item']
 def startHandler(self, response):
     for item in response.xpath("//rss/channel/item"):
         responseCopy = response.replace(url=item.xpath("link//text()").get())
         responseCopy.meta["item"] = item
         yield LomBase.parse(self, responseCopy)
 def parse(self, response):
   print(response.url)
   return LomBase.parse(self, response)
 def parseEntry(self, response):
   if self.get('language', response=response) == 'de':
     return LomBase.parse(self, response)
   logging.info('Skipping entry with language ' + self.get('language', response=response))
   return None
Example #14
    def parse(self, response: scrapy.http.Response):
        print("Parsing URL: " + response.url)

        # Call Splash only once per page (each page contains multiple XML elements).
        data = self.getUrlData(response.url)
        response.meta["rendered_data"] = data

        # We would use .fromstring(response.text) if the response did not include the XML declaration:
        # <?xml version="1.0" encoding="utf-8"?>
        root = etree.XML(response.body)
        tree = etree.ElementTree(root)

        # The progress bar (pbar) also works when the crawl resumes with self.page > 0.
        if self.pbar is None:
            total_elements = int(tree.xpath('/root/sum')[0].text)
            remaining_elements = total_elements - self.page * self.limit
            self.pbar = tqdm(total=remaining_elements,
                             desc=self.name + " downloading progress: ",
                             initial=self.page * self.limit)

        # If results are returned.
        elements = tree.xpath('/root/items/*')
        if len(elements) > 0:
            for element in elements:
                self.pbar.update(1)

                copyResponse = response.copy()
                element_xml_str = etree.tostring(element,
                                                 pretty_print=True,
                                                 encoding='unicode')
                element_dict = xmltodict.parse(element_xml_str)

                # TODO: It's probably a pointless attribute.
                # del element_dict["data"]["score"]

                # Passing the dictionary for easier access to attributes.
                copyResponse.meta['item'] = element_dict["data"]

                # In case JSON string representation is preferred:
                # copyResponse._set_body(json.dumps(copyResponse.meta['item'], indent=1, ensure_ascii=False))
                copyResponse._set_body(element_xml_str)

                if self.hasChanged(copyResponse):
                    yield self.handleEntry(copyResponse)

                # LomBase.parse() has to be called for every individual instance that needs to be saved to the database.
                LomBase.parse(self, copyResponse)

        # TODO: sleep briefly so as not to stress the REST APIs.
        # time.sleep(0.1)

        # If the number of returned results equals the imposed limit, more results may be available.
        if len(elements) == self.limit:
            self.page += 1
            url = self.apiUrl.replace('%start',
                                      str(self.page * self.limit)).replace(
                                          '%anzahl', str(self.limit))
            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 headers={
                                     'Accept': 'application/xml',
                                     'Content-Type': 'application/xml'
                                 })
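
A side note on the progress bar in Example #14: tqdm's initial parameter pre-fills the counter, which is what lets a crawl that resumes with self.page > 0 display sensible progress. A minimal standalone sketch with purely illustrative numbers (the snippet above sets total to the remaining count rather than the grand total; both render a usable bar):

from tqdm import tqdm

# Purely illustrative numbers: resuming at page 3 with 100 elements per page.
page, limit, total_elements = 3, 100, 1000
pbar = tqdm(total=total_elements,  # expected element count overall
            initial=page * limit,  # elements fetched before the restart
            desc="downloading progress: ")
for _ in range(total_elements - page * limit):
    pbar.update(1)  # one tick per parsed element
pbar.close()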
Example #15
 def handleLink(self, response):
     return LomBase.parse(self, response)
Example #16
    def parse(self, response: scrapy.http.Response):
        print("Parsing URL: " + response.url)

        # Call Splash only once per page (each page contains multiple XML elements).
        data = self.getUrlData(response.url)
        response.meta["rendered_data"] = data

        # We would use .fromstring(response.text) if the response did not include the XML declaration:
        # <?xml version="1.0" encoding="utf-8"?>
        root = etree.XML(response.body)
        tree = etree.ElementTree(root)

        # Get the total number of possible elements
        elements_total = int(tree.xpath('/root/sum')[0].text)

        # If results are returned.
        elements = tree.xpath("/root/items/*")
        if len(elements) > 0:
            for element in elements:
                copyResponse = response.copy()
                element_xml_str = etree.tostring(element,
                                                 pretty_print=True,
                                                 encoding="unicode")
                try:
                    element_dict = xmltodict.parse(element_xml_str)
                    element_dict = element_dict["data"]

                    # Preparing the values here helps for all following logic across the methods.
                    self.prepare_element(element_dict)

                    # If there is no available county (Kreis) code, then we do not want to deal with this element.
                    if not ("county_ids" in element_dict
                            and element_dict["county_ids"] is not None
                            and len(element_dict["county_ids"]) > 0):
                        continue

                    # TODO: It's probably a pointless attribute.
                    # del element_dict["data"]["score"]

                    # Passing the dictionary for easier access to attributes.
                    copyResponse.meta["item"] = element_dict

                    # In case JSON string representation is preferred:
                    # copyResponse._set_body(json.dumps(copyResponse.meta['item'], indent=1, ensure_ascii=False))
                    copyResponse._set_body(element_xml_str)

                    if self.hasChanged(copyResponse):
                        yield self.handleEntry(copyResponse)

                    # LomBase.parse() has to be called for every individual instance that needs to be saved to the database.
                    LomBase.parse(self, copyResponse)
                except Exception as e:
                    # Parenthesize the conditional so the prefix is always printed.
                    print("Issues with the element: " +
                          (str(element_dict["id_local"])
                           if "id_local" in element_dict else ""))
                    print(str(e))

        current_expected_count = (self.page + 1) * self.limit

        # TODO: sleep briefly so as not to stress the REST APIs.
        # time.sleep(0.1)

        # If we are below the total number of available elements, continue fetching more pages.
        if current_expected_count < elements_total:
            self.page += 1
            url = self.apiUrl.replace("%start",
                                      str(self.page * self.limit)).replace(
                                          "%anzahl", str(self.limit))
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                headers={
                    "Accept": "application/xml",
                    "Content-Type": "application/xml",
                },
            )
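
To spell out the stop condition above with illustrative numbers: with limit = 100 and elements_total = 250, page 0 expects 100 (< 250, fetch more), page 1 expects 200 (< 250, fetch more), and page 2 expects 300 (not < 250, stop). Unlike the limit-based check in the earlier examples, this variant never has to request a final, empty page:

# Illustrative numbers only.
limit, elements_total = 100, 250
for page in range(4):
    current_expected_count = (page + 1) * limit
    print(page, current_expected_count < elements_total)
# Prints: 0 True, 1 True, 2 False, 3 False -> crawling stops after page 2.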
Example #17
 def parse(self, response):
   if not self.get('description', response=response):
     logging.info('skipping empty entry in serlo')
     return None
   return LomBase.parse(self, response)
 def handleEntry(self, response):
     response.meta["item"] = json.loads(response.body_as_unicode())
     return LomBase.parse(self, response)
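
Taken together, these examples share one pattern: clone the response, stash the pre-parsed record under meta["item"] so downstream getters can read it, and delegate to LomBase.parse(). A condensed, hypothetical sketch of that pattern (load_records stands in for whatever JSON/XML/CSV extraction the concrete spider does; LomBase and hasChanged are the project's own helpers):

def parse(self, response):
    # load_records is a hypothetical stand-in for the per-format extraction.
    for record in self.load_records(response):
        copyResponse = response.copy()
        copyResponse.meta["item"] = record  # downstream getters read meta["item"]
        if self.hasChanged(copyResponse):   # skip entries that did not change
            yield LomBase.parse(self, copyResponse)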