def parse_video_page(self, response: scrapy.http.Response = None):
    """
    Parse a single video page for metadata.

    A video page looks like https://vimeo.com/videoID (videoID being a number).
    The page is only handed to ``LomBase.parse`` if it contains a
    ``<script type="application/ld+json">`` element; otherwise the entry is
    skipped and a debug message with the page URL is logged.

    :param response: response object of the video page
    :return: the result of ``LomBase.parse(self, response)``, or ``None`` when
        no ld+json script was found
    """
    # XPath to description of a video looks like this:
    # //*[@id="main"]/div/main/div/div/div/div[2]/div[3]/div

    # ``.get()`` yields None when the XPath matches nothing, so the result has
    # to be checked *before* any string method is called on it. (Calling
    # ``.strip()`` first raised AttributeError and made the logging branch
    # below unreachable.)
    ld_json_script = response.xpath('//script[@type="application/ld+json"]').get()
    if ld_json_script is None:
        # if ld+json script-container doesn't exist, at least log the error
        logging.debug("Could not find ld+json script, skipping entry: %s", response.url)
        return None

    # TODO: there's additional metadata inside a script block: window.vimeo.clip_page_config
    # - longer description - maybe use this one?
    # - duration (both in seconds and formatted)
    # - ads
    #   - house_ads_enabled
    #   - third_party_ads_enabled
    # response.xpath('//*[@id="wrap"]/div[2]/script[1]/text()').get()
    # might have to access it and split it up with regEx
    return LomBase.parse(self, response)
def getPermissions(self, response=None) -> PermissionItemLoader:
    """
    Build the permission loader for an entry.

    Starts from the loader returned by ``LomBase.getPermissions`` and adds the
    ``DEFAULT_PUBLIC_STATE`` setting as the value of its 'public' field.

    :param response: response of the video page, passed through to ``LomBase``
    :return: the permission loader with the 'public' value added
    """
    loader = LomBase.getPermissions(self, response)
    # TODO: PermissionItemLoader - which value should be set?
    default_public_state = self.settings.get("DEFAULT_PUBLIC_STATE")
    loader.add_value('public', default_public_state)  # is this necessary?
    return loader
def getLOMEducational(self, response=None) -> LomEducationalItemLoader:
    """
    Build the educational loader for an entry.

    Starts from the loader returned by ``LomBase.getLOMEducational`` and adds
    the fixed language code 'de'.

    :param response: response of the video page, passed through to ``LomBase``
    :return: the educational loader with the 'language' value added
    """
    educational = LomBase.getLOMEducational(self, response)
    # TODO: which category does "schule im Aufbruch" fit into? double-check!
    # okay to hardcode this? (some videos are bilingual, but meta data from
    # vimeo doesn't offer language attributes)
    educational.add_value('language', 'de')
    return educational
def getLOMGeneral(self, response=None) -> LomGeneralItemloader:
    """
    Build the general loader (title and description) for an entry.

    Both values are read from the first element of ``self.get_ld_json(response)``
    and HTML-unescaped before they are added to the loader.

    :param response: response of the video page
    :return: the general loader with 'title' and 'description' added
    """
    loader = LomBase.getLOMGeneral(self, response)
    first_entry = self.get_ld_json(response)[0]
    # (loader field, key inside the ld+json entry), added in this order
    field_to_key = (
        ('title', "name"),
        ('description', "description"),
    )
    for field, key in field_to_key:
        loader.add_value(field, html.unescape(first_entry[key]))
    # TODO: set manually if there are no keywords given?
    # loader.add_value('keyword', '')  # manual keywords?
    return loader
def getLOMLifecycle(self, response=None) -> LomLifecycleItemloader:
    """
    Build the lifecycle loader (author name and URL) for an entry.

    The author is taken from the second element of
    ``self.get_ld_json(response)``: the "item" of its first "itemListElement".

    :param response: response of the video page
    :return: the lifecycle loader with 'organization' and 'url' added
    """
    loader = LomBase.getLOMLifecycle(self, response)
    item_list = self.get_ld_json(response)[1]["itemListElement"]
    # author information is inside a dictionary with schema.org type Person
    # we could maybe grab the whole object instead?
    author = item_list[0]["item"]
    # TODO: LomLifeCycleItemLoader
    loader.add_value('organization', author["name"])
    loader.add_value('url', author["@id"])
    return loader
def getBase(self, response=None) -> BaseItemLoader:
    """
    Build the base loader (source id, hash, modification date, thumbnail).

    The page URL is used as 'sourceId'. The remaining values are read from
    the first element of ``self.get_ld_json(response)``; the hash is that
    entry's "dateModified" concatenated with ``self.version``.

    :param response: response of the video page
    :return: the base loader with the four values added
    """
    loader = LomBase.getBase(self, response)
    first_entry = self.get_ld_json(response)[0]

    # making double-sure that we're using a string for sourceID
    loader.add_value('sourceId', str(response.url))

    # maybe add sourceID + dateModified as hash?
    loader.add_value("hash", str(first_entry["dateModified"] + self.version))
    loader.add_value("lastModified", first_entry["dateModified"])
    loader.add_value('thumbnail', first_entry["thumbnailUrl"])
    return loader
def getValuespaces(self, response) -> ValuespaceItemLoader:
    """
    Build the valuespace loader with a fixed set of values.

    Starts from the loader returned by ``LomBase.getValuespaces`` and adds the
    same five hard-coded key/value pairs for every entry.

    :param response: response of the video page, passed through to ``LomBase``
    :return: the valuespace loader with the fixed values added
    """
    loader = LomBase.getValuespaces(self, response)
    # TODO: ValueSpaceItemLoader() missing keys? which ones are to be manually set?
    # - dataProtectionConformity
    # - fskRating
    # - oer
    # - educationalContext
    # - educationalContentType
    fixed_values = (
        ('conditionsOfAccess', 'no_login'),
        # set to yes because of vimeos own advertisements
        ('containsAdvertisement', 'yes'),
        ('price', 'no'),
        ('intendedEndUserRole', 'teacher'),
        ('learningResourceType', 'video'),
    )
    for key, value in fixed_values:
        loader.add_value(key, value)
    return loader
def getLOMTechnical(self, response=None) -> LomTechnicalItemLoader:
    """
    Build the technical loader (format, location, duration) for an entry.

    'location' and 'duration' are read from the first element of
    ``self.get_ld_json(response)``. 'format' is read from the page's
    ``<meta>`` tags in the HTML head.

    :param response: response of the video page
    :return: the technical loader with 'format', 'location' and 'duration' added
    """
    # TODO: LomTechnicalItemLoader()
    technical = LomBase.getLOMTechnical(self, response)
    ld_json = self.get_ld_json(response)
    # TODO: Make sure that we're grabbing the right type for 'format'
    # if we were to acquire the format by an API call
    # (see https://developer.vimeo.com/api/reference/responses/video), vimeo would offer 3 options:
    #   'live' (for live events),
    #   'stock' (this video is a Vimeo Stock video)
    #   'video' (this video is a standard Vimeo video)

    # grabs the video type from the metadata header - most of the times it'll be video.other
    # Select the tag by its attribute instead of its position: a positional
    # index silently returns a different tag (or nothing) as soon as the page
    # gains, loses or reorders a <meta> element.
    # NOTE(review): presumably the 18th meta tag was the Open Graph "og:type"
    # tag (its values look like "video.other") - confirm against a live page.
    video_type = response.xpath('/html/head/meta[@property="og:type"]/@content').get()
    if video_type is None:
        # fall back to the previous positional lookup so that pages without
        # an og:type tag behave exactly as before
        video_type = response.xpath('/html/head/meta[18]/@content').get()
    technical.add_value('format', video_type)
    technical.add_value('location', ld_json[0]["url"])
    technical.add_value('duration', ld_json[0]["duration"])
    return technical
def getLicense(self, response=None) -> LicenseItemLoader:
    """
    Build the license loader for an entry.

    Starts from the loader returned by ``LomBase.getLicense`` and adds the
    value returned by ``self.get_license(response)`` as its 'url'.

    :param response: response of the video page
    :return: the license loader with the 'url' value added
    """
    loader = LomBase.getLicense(self, response)
    loader.add_value('url', self.get_license(response))
    return loader
def __init__(self, **kwargs):
    """
    Initialise the instance by forwarding all keyword arguments to ``LomBase.__init__``.

    :param kwargs: keyword arguments, passed on unchanged
    """
    LomBase.__init__(self, **kwargs)