Beispiel #1
0
    def group_mediothek_elements(self, mediothek_elements):
        """
        Build the Mediothek collections out of the given elements.

        Collection elements in Mediothek have no special element to represent them (a parent element). Therefore,
        the elements are grouped with ``self.group_by_elements(..., "mediumNummer")`` and, for every group, a deep
        copy of its first element is turned into the collection representative (parent element). All original
        elements of the group become the children of that parent.

        Note that the passed-in element dictionaries are modified in place: "searchable", "aggregation_level",
        "uuid" and "title" are overwritten on each of them.

        :param mediothek_elements: Mediothek element dictionaries. The keys "id", "mediumId", "einzeltitel",
            "downloadUrl" and "dateiBezeichnung" are read below; "id" has to be a string because a string suffix
            is appended to it.
        :return: a flat list holding, for every group in sorted key order, the parent element followed by its
            children (as returned by ``self.relate_parent_with_children_elements``).
        """
        mediothek_default_download_url = "https://www.schulportal-thueringen.de/web/guest/media/detail?tspi="

        # The parent needs a new ID, different from the existing ones. We take the ID of the copied element and
        # add this suffix to mark it as an artificial element. Such a big number for an ID is not expected to
        # collide with the IDs of real elements.
        artificial_element_suffix = "000000"

        mediothek_elements_grouped_by = self.group_by_elements(
            mediothek_elements, "mediumNummer")

        collection_elements = []

        edusharing = EduSharing()  # Used to generate UUIDs.

        for group_by_key in sorted(mediothek_elements_grouped_by):
            group = mediothek_elements_grouped_by[group_by_key]

            # Generate the new "parent" (representative) element from a copy, so that the first child itself
            # stays untouched by the parent-specific changes.
            parent_element = copy.deepcopy(group[0])
            parent_element["id"] = parent_element["id"] + artificial_element_suffix
            parent_element["downloadUrl"] = mediothek_default_download_url + str(
                parent_element["mediumId"])
            parent_element["title"] = parent_element["einzeltitel"]
            parent_element["searchable"] = 1
            parent_element["aggregation_level"] = 2
            parent_element["uuid"] = edusharing.buildUUID(
                parent_element["downloadUrl"])

            # Mark every element of the group as a child of the collection.
            for element in group:
                element["searchable"] = 0
                element["aggregation_level"] = 1
                element["uuid"] = edusharing.buildUUID(element["downloadUrl"])
                element["title"] = element["dateiBezeichnung"]

            # Add connections from parent to children elements.
            parent_element, group = self.relate_parent_with_children_elements(
                parent_element, group)

            collection_elements.append(parent_element)
            collection_elements.extend(group)

        return collection_elements
Beispiel #2
0
 def hasChanged(self, response=None) -> bool:
     """
     Tell whether the item behind ``response`` has to be processed.

     The checks run in this order and the first one that applies decides:

     1. ``self.forceUpdate`` is set: always ``True``.
     2. ``self.uuid`` is set: ``True`` only if ``self.getUUID(response)`` equals it.
     3. ``self.remoteId`` is set: ``True`` only if ``str(self.getId(response))`` equals it.
     4. Otherwise the entry is looked up with ``EduSharing().findItem``; the item counts as changed when
        no entry is found or when the second field of the entry differs from ``self.getHash(response)``.

     :param response: passed through to ``getUUID``, ``getId`` and ``getHash``.
     :return: ``True`` if the item should be processed, ``False`` otherwise.
     """
     if self.forceUpdate:
         return True
     if self.uuid:
         if self.getUUID(response) == self.uuid:
             logging.info("matching requested id: %s", self.uuid)
             return True
         return False
     if self.remoteId:
         if str(self.getId(response)) == self.remoteId:
             logging.info("matching requested id: %s", self.remoteId)
             return True
         return False
     db = EduSharing().findItem(self.getId(response), self)
     # Identity check: None is a singleton and "==" could be overridden by the returned object.
     changed = db is None or db[1] != self.getHash(response)
     if not changed:
         # db[0] is presumably the identifier of the stored entry -- confirm against EduSharing.findItem.
         logging.info("Item %s has not changed", db[0])
     return changed
Beispiel #3
0
    def process_item(self, raw_item, spider):
        """
        Insert (or update) the given item and hand it on unchanged.

        The item is stored under a UUID built by ``EduSharing.buildUUID``: from the response URL when
        ``item["response"]`` contains a "url" entry, otherwise from ``item["hash"]``.

        :param raw_item: the item to store; it is wrapped in an ``ItemAdapter`` for key access.
        :param spider: passed through to ``self.insertItem``.
        :return: ``raw_item`` exactly as it was received.
        """
        item = ItemAdapter(raw_item)
        response = item["response"]
        uuid_source = response["url"] if "url" in response else item["hash"]
        entryUUID = EduSharing.buildUUID(uuid_source)
        self.insertItem(spider, entryUUID, item)
        logging.info("item %s inserted/updated", entryUUID)

        # @TODO: We may need to handle Collections (items that carry a 'collection' entry).
        return raw_item
Beispiel #4
0
 def getUUID(self, response=None) -> str:
     """
     Return the UUID that ``EduSharing.buildUUID`` derives from this item's URI.

     :param response: passed through to ``self.getUri``.
     """
     uuid_builder = EduSharing()
     uri = self.getUri(response)
     return uuid_builder.buildUUID(uri)
Beispiel #5
0
    def group_pixiothek_elements(self, pixiothek_elements, mediothek_elements):
        """
        Build the Pixiothek collections and drop the Mediothek elements that were used as their parents.

        Collection elements in Pixiothek have a "parent" (representative) Mediothek element that describes the whole
        collection. Our task in this method is for every Pixiothek group (grouped by "serientitel") to find its
        Mediothek element and add the connections between it and the Pixiothek elements. These Mediothek elements
        will not be considered as children of Mediothek collections, which is why they are removed from
        ``mediothek_elements``.

        A Mediothek element qualifies as the parent of a group if it is the only Mediothek element whose
        "einzeltitel" equals the group's "serientitel" and it is not already the parent of another group. If no
        such element exists, a deep copy of the first Pixiothek element of the group becomes the parent
        (representative element) and gets an artificial ID and download URL.

        Note that the passed-in Pixiothek element dictionaries are modified in place ("searchable",
        "aggregation_level", "uuid" and "title" are overwritten) and that ``mediothek_elements`` is modified in
        place as well.

        :param pixiothek_elements: Pixiothek element dictionaries; "id" has to be a string because it is
            concatenated with other strings.
        :param mediothek_elements: list of Mediothek element dictionaries; used parents are removed from it.
        :return: a tuple ``(collection_elements, mediothek_elements)``. ``collection_elements`` is a flat list
            holding, for every group in sorted key order, the parent followed by its children;
            ``mediothek_elements`` is the very list that was passed in, without the elements used as parents.
        """
        default_download_url = "https://www.schulportal-thueringen.de/html/images/" \
                               "themes/tsp2/startseite/banner_phone_startseite.jpg?id="

        mediothek_default_download_url = "https://www.schulportal-thueringen.de/web/guest/media/detail?tspi="

        single_element_collection_serientitel = "Mediensammlungen zur freien Verwendung im Bildungsbereich"

        # Suffix appended to the ID of a copied element to mark the resulting parent as an artificial element.
        # Such a big number for an ID is not expected to collide with the IDs of real elements.
        artificial_element_suffix = "000000"

        pixiothek_elements_grouped_by = self.group_by_elements(
            pixiothek_elements, "serientitel")

        # Group Mediothek elements by einzeltitel. We are going to use this dictionary in the following loop to find
        # Pixiothek items that have this value in their serientitel.
        mediothek_elements_grouped_by_einzeltitel = self.group_by_elements(
            mediothek_elements, "einzeltitel")

        collection_elements = []

        edusharing = EduSharing()  # Used to generate UUIDs.

        # Keeping track of the IDs of "parent" (representative) elements to remove them from the Mediothek elements.
        parent_mediothek_elements = set()

        for group_by_key in sorted(pixiothek_elements_grouped_by):
            group = pixiothek_elements_grouped_by[group_by_key]
            serientitel = group[0]["serientitel"] if "serientitel" in group[0] else None

            # Mediothek elements whose einzeltitel equals this group's serientitel.
            if serientitel in mediothek_elements_grouped_by_einzeltitel:
                parent_candidates = mediothek_elements_grouped_by_einzeltitel[serientitel]
            else:
                parent_candidates = []

            # If a single Mediothek element exists with the same einzeltitel as this group's serientitel, and it is
            # not used as the parent of another collection yet, then we shall use it as the parent element of this
            # collection.
            if len(parent_candidates) == 1 and parent_candidates[0]["id"] not in parent_mediothek_elements:
                mediothek_parent = parent_candidates[0]

                parent_element = copy.deepcopy(mediothek_parent)
                parent_mediothek_elements.add(parent_element["id"])
                parent_element["title"] = parent_element["einzeltitel"]
                parent_element["downloadUrl"] = mediothek_default_download_url + str(
                    parent_element["mediumId"])

                # If the found Mediothek element has a serientitel equal to a predefined value, another copy of
                # the unmodified Mediothek element is added to the group, so that it is also emitted as a child
                # of the collection it represents.
                if parent_element["serientitel"] == single_element_collection_serientitel:
                    group.append(copy.deepcopy(mediothek_parent))

            # Else, we shall use the first element of this group as the parent element.
            else:
                parent_element = copy.deepcopy(group[0])

                # We need to assign a new ID, different from the previous ones.
                parent_element["id"] = parent_element["id"] + artificial_element_suffix

                # Assign a fake URL that we can still recognize if we ever want to allow the access of the collection
                # content.
                parent_element["downloadUrl"] = default_download_url + parent_element["id"]
                parent_element["title"] = parent_element["serientitel"]

            parent_element["searchable"] = 1
            parent_element["aggregation_level"] = 2
            parent_element["uuid"] = edusharing.buildUUID(
                parent_element["downloadUrl"])

            # Mark every element of the group as a child of the collection.
            for element in group:
                element["searchable"] = 0
                element["aggregation_level"] = 1
                element["uuid"] = edusharing.buildUUID(element["downloadUrl"])
                element["title"] = element["dateiBezeichnung"]

            # Add connections from parent to children elements.
            parent_element, group = self.relate_parent_with_children_elements(
                parent_element, group)

            collection_elements.append(parent_element)
            collection_elements.extend(group)

        # Remove Mediothek elements which were used as parents. The slice assignment filters the list in a single
        # pass and keeps the list object itself, so the caller's list is updated too.
        mediothek_elements[:] = [
            element for element in mediothek_elements
            if element["id"] not in parent_mediothek_elements
        ]

        return collection_elements, mediothek_elements