Example #1
0
    def parse_video_page(self, response: scrapy.http.Response = None):
        """
        Parse a single video page (e.g. https://vimeo.com/videoID, where videoID is a number) for metadata.

        Metadata is only extracted if the page contains an "application/ld+json" script container.
        If it does not, the page is skipped and a debug message with the page URL is logged.

        :param response: the downloaded video page
        :return: the result of ``LomBase.parse`` for this page, or ``None`` if the page was skipped
        """
        # XPath to description of a video looks like this:
        # //*[@id="main"]/div/main/div/div/div/div[2]/div[3]/div

        # .get() yields None when the XPath matches nothing, so the result has to be
        # checked BEFORE any string method is called on it (calling .strip() first
        # would raise an AttributeError instead of reaching the logging branch).
        ld_json_script = response.xpath(
            '//script[@type="application/ld+json"]').get()
        if ld_json_script is None:
            # if ld+json script-container doesn't exist, at least log the error
            logging.debug("Could not find ld+json script, skipping entry: %s",
                          response.url)
            return None

        # TODO: there's additional metadata inside a script block: window.vimeo.clip_page_config
        #   - longer description - maybe use this one?
        #   - duration (both in seconds and formatted)
        #   - ads
        #       - house_ads_enabled
        #       - third_party_ads_enabled
        # response.xpath('//*[@id="wrap"]/div[2]/script[1]/text()').get()
        # might have to access it and split it up with regEx

        return LomBase.parse(self, response)
Example #2
0
 def getPermissions(self, response=None) -> PermissionItemLoader:
     """
     Build the permission loader for an entry.

     Starts from the loader returned by ``LomBase.getPermissions`` and adds the
     ``DEFAULT_PUBLIC_STATE`` setting as its 'public' value.

     :param response: the page response, passed through to ``LomBase.getPermissions``
     :return: the permission loader with the 'public' value added
     """
     permission_loader = LomBase.getPermissions(self, response)
     # TODO: PermissionItemLoader - which value should be set?
     default_public_state = self.settings.get("DEFAULT_PUBLIC_STATE")
     # is this necessary?
     permission_loader.add_value('public', default_public_state)
     return permission_loader
Example #3
0
 def getLOMEducational(self, response=None) -> LomEducationalItemLoader:
     """
     Build the educational loader for an entry.

     Starts from the loader returned by ``LomBase.getLOMEducational`` and adds a
     hard-coded 'language' value of 'de'.

     :param response: the page response, passed through to ``LomBase.getLOMEducational``
     :return: the educational loader with the 'language' value added
     """
     educational = LomBase.getLOMEducational(self, response)
     # TODO: which category does "schule im Aufbruch" fit into? double-check!
     # okay to hardcode this? (some videos are bilingual, but meta
     # data from vimeo doesn't offer language attributes)
     educational.add_value('language', 'de')
     return educational
Example #4
0
 def getLOMGeneral(self, response=None) -> LomGeneralItemloader:
     """
     Build the general loader (title and description) for an entry.

     Both values are read from the first element of the structure returned by
     ``self.get_ld_json`` and are HTML-unescaped before being added.

     :param response: the page response, passed to ``LomBase.getLOMGeneral`` and ``self.get_ld_json``
     :return: the general loader with 'title' and 'description' added
     """
     general_loader = LomBase.getLOMGeneral(self, response)
     video_entry = self.get_ld_json(response)[0]

     title = html.unescape(video_entry["name"])
     general_loader.add_value('title', title)

     description = html.unescape(video_entry["description"])
     general_loader.add_value('description', description)
     # TODO: set manually if there are no keywords given?
     #  general_loader.add_value('keyword', '')     # manual keywords?
     return general_loader
Example #5
0
 def getLOMLifecycle(self, response=None) -> LomLifecycleItemloader:
     """
     Build the lifecycle loader (author information) for an entry.

     The author is taken from the second element of the structure returned by
     ``self.get_ld_json``: the "item" of the first "itemListElement" entry.
     Its "name" is added as 'organization' and its "@id" as 'url'.

     :param response: the page response, passed to ``LomBase.getLOMLifecycle`` and ``self.get_ld_json``
     :return: the lifecycle loader with 'organization' and 'url' added
     """
     lifecycle_loader = LomBase.getLOMLifecycle(self, response)
     item_list = self.get_ld_json(response)[1]
     # author information is inside a dictionary with schema.org type Person
     # we could maybe grab the whole object instead?
     first_list_element = item_list["itemListElement"][0]
     author = first_list_element["item"]
     # TODO: LomLifeCycleItemLoader
     lifecycle_loader.add_value('organization', author["name"])
     lifecycle_loader.add_value('url', author["@id"])
     return lifecycle_loader
Example #6
0
 def getBase(self, response=None) -> BaseItemLoader:
     """
     Build the base loader (source id, hash, modification date, thumbnail) for an entry.

     The page URL is used as 'sourceId'; 'hash' is the entry's "dateModified"
     concatenated with ``self.version``. "dateModified" and "thumbnailUrl" are
     read from the first element of the structure returned by ``self.get_ld_json``.

     :param response: the page response, passed to ``LomBase.getBase`` and ``self.get_ld_json``
     :return: the base loader with the four values added
     """
     base_loader: BaseItemLoader = LomBase.getBase(self, response)
     video_entry = self.get_ld_json(response)[0]

     # making double-sure that we're using a string for sourceID
     source_id = str(response.url)
     base_loader.add_value('sourceId', source_id)

     # maybe add sourceID + dateModified as hash?
     entry_hash: str = str(video_entry["dateModified"] + self.version)
     base_loader.add_value("hash", entry_hash)

     base_loader.add_value("lastModified", video_entry["dateModified"])
     base_loader.add_value('thumbnail', video_entry["thumbnailUrl"])
     return base_loader
Example #7
0
 def getValuespaces(self, response) -> ValuespaceItemLoader:
     """
     Build the valuespace loader for an entry from a fixed set of values.

     Starts from the loader returned by ``LomBase.getValuespaces`` and adds the
     hard-coded key/value pairs listed below, in order.

     :param response: the page response, passed through to ``LomBase.getValuespaces``
     :return: the valuespace loader with the fixed values added
     """
     # TODO: ValueSpaceItemLoader() missing keys? which ones are to be manually set?
     #   - dataProtectionConformity
     #   - fskRating
     #   - oer
     #   - educationalContext
     #   - educationalContentType
     fixed_values = (
         ('conditionsOfAccess', 'no_login'),
         # set to yes because of vimeos own advertisements
         ('containsAdvertisement', 'yes'),
         ('price', 'no'),
         ('intendedEndUserRole', 'teacher'),
         ('learningResourceType', 'video'),
     )
     valuespaces = LomBase.getValuespaces(self, response)
     for field_name, field_value in fixed_values:
         valuespaces.add_value(field_name, field_value)
     return valuespaces
Example #8
0
    def getLOMTechnical(self, response=None) -> LomTechnicalItemLoader:
        """
        Build the technical loader (format, location, duration) for an entry.

        'format' is read from the content attribute of a meta tag in the page head;
        'location' and 'duration' come from the "url" and "duration" keys of the first
        element of the structure returned by ``self.get_ld_json``.

        :param response: the page response, passed to ``LomBase.getLOMTechnical`` and ``self.get_ld_json``
        :return: the technical loader with the three values added
        """
        # TODO: LomTechnicalItemLoader()
        technical_loader = LomBase.getLOMTechnical(self, response)
        video_entry = self.get_ld_json(response)[0]

        # TODO: Make sure that we're grabbing the right type for 'format'
        # if we were to acquire the format by an API call
        # (see https://developer.vimeo.com/api/reference/responses/video), vimeo would offer 3 options:
        # 'live' (for live events),
        # 'stock' (this video is a Vimeo Stock video)
        # 'video' (this video is a standard Vimeo video)

        # grabs the video type from the metadata header - most of the times it'll be video.other
        # NOTE(review): the meta tag is selected purely by position (18th <meta> in <head>);
        # presumably this breaks if the page head changes - confirm / consider an attribute-based XPath
        video_format = response.xpath('/html/head/meta[18]/@content').get()
        technical_loader.add_value('format', video_format)

        technical_loader.add_value('location', video_entry["url"])
        technical_loader.add_value('duration', video_entry["duration"])
        return technical_loader
Example #9
0
 def getLicense(self, response=None) -> LicenseItemLoader:
     """
     Build the license loader for an entry.

     Starts from the loader returned by ``LomBase.getLicense`` and adds the value
     returned by ``self.get_license`` as its 'url'.

     :param response: the page response, passed to ``LomBase.getLicense`` and ``self.get_license``
     :return: the license loader with the 'url' value added
     """
     license_loader = LomBase.getLicense(self, response)
     license_loader.add_value('url', self.get_license(response))
     return license_loader
Example #10
0
 def __init__(self, **kwargs):
     """
     Initialise the instance by delegating to ``LomBase.__init__``.

     :param kwargs: keyword arguments, forwarded unchanged to ``LomBase.__init__``
     """
     # the base initialiser is called explicitly on LomBase (not via super())
     LomBase.__init__(self, **kwargs)