def parse(self, response: scrapy.http.Response, **kwargs):
    """
    Scrapy Contracts:
    @url http://ginkgomaps.com/landkarten_deutschland.html
    @returns items 1
    """
    # making sure that the current url is marked as parsed:
    self.debug_parsed_urls.add(response.url)
    # IMPORTANT: modern browsers insert "tbody"-elements into tables, but scrapy doesn't see those tags!
    # Remember: whatever markup you see with the developer tools in your browser, you need to manually remove
    # ANY <tbody>-tag that sits inside your XPath expression, otherwise it will return an empty [] !
    # response.xpath('/html/body/center/table[1]/tr[4]/td[3]/table[1]').get()

    # the first index page contains 42 maps, all inside tables of the class "smalltable":
    # response.xpath('//table[@class="smalltable"]')
    table_body = response.xpath('//table[@class="smalltable"]')
    description_temp = str()
    first_thumbnail = str()
    # a SelectorList is never None, so we check for (non-)emptiness instead:
    if table_body:
        for table_item in table_body:
            # print(table_item.get())
            map_title = table_item.xpath('tr/td[1]/a[2]/text()').get()
            map_design_heading = table_item.xpath('tr/td[2]/u[1]/text()').get()
            map_design = table_item.xpath('tr/td[2]/p[1]/text()').get()
            map_content_heading = table_item.xpath('tr/td[2]/u[2]/text()').get()
            map_content = table_item.xpath('tr/td[2]/p[2]/text()').get()
            # map_thumbnail = response.urljoin(table_item.xpath('tr/td[1]/a[1]/img/@src').get())
            # map_thumbnail_description = table_item.xpath('tr/td[1]/a[1]/img/@alt').get()
            # pdf_download_url = response.urljoin(table_item.xpath('tr/td[2]/p[3]/a[1]/@href').get())
            # pdf_download_title = table_item.xpath('tr/td[2]/p[3]/a[2]/text()').get()
            # jpeg_download_medium_url = response.urljoin(table_item.xpath('tr/td[2]/p[4]/a[2]/@href').get())
            # jpeg_download_medium_description = table_item.xpath('tr/td[2]/p[4]/a[2]/text()').get()
            # jpeg_download_high_url = response.urljoin(table_item.xpath('tr/td[2]/p[5]/a[2]/@href').get())
            # jpeg_download_high_description = table_item.xpath('tr/td[2]/p[5]/a[2]/text()').get()
            description_temp += map_title + "\n" \
                + map_design_heading + map_design \
                + map_content_heading + map_content
        # while we could theoretically grab all thumbnails during the above loop,
        # the first one is enough for a preview-image in edu-sharing
        first_thumbnail = response.urljoin(table_body[0].xpath('tr/td[1]/a[1]/img/@src').get())
    description_temp = w3lib.html.strip_html5_whitespace(description_temp)

    base = super().getBase(response=response)
    base.add_value('sourceId', response.url)
    last_modified = response.xpath('/html/head/meta[6]/@content').get()
    hash_temp = last_modified + self.version
    base.add_value('hash', hash_temp)
    base.add_value('type', Constants.TYPE_MATERIAL)
    # first_thumbnail is always a string, so we check for non-emptiness (not None):
    if first_thumbnail:
        base.add_value('thumbnail', first_thumbnail)
    base.add_value('lastModified', last_modified)

    lom = LomBaseItemloader()
    general = LomGeneralItemloader(response=response)
    general.add_value('language', 'de')
    general.add_value('identifier', response.url)
    # the description could be extended with additional infos about the map-formats and their resolutions
    # (if necessary)
    general.add_value('description', description_temp)
    general.add_value('title', response.xpath('/html/head/title/text()').get())
    # keywords are stored inside a string, separated by commas with (sometimes multiple) whitespaces;
    # therefore a RegEx is needed to build a list of individual keywords, since a plain String.split()
    # isn't enough:
    keyword_string = response.xpath('/html/head/meta[@name="keywords"]/@content').get()
    kw_regex_split = re.split(r'\s*,\s*', keyword_string)
    general.add_value('keyword', kw_regex_split)
    lom.add_value('general', general.load_item())

    technical = LomTechnicalItemLoader()
    technical.add_value('format', 'text/html')
    technical.add_value('location', response.url)
    lom.add_value('technical', technical.load_item())

    lifecycle = LomLifecycleItemloader()
    lifecycle.add_value('date', last_modified)
    lifecycle.add_value('role', 'author')
    lifecycle.add_value('firstName', 'Dirk')
    lifecycle.add_value('lastName', 'Benkert')
    lifecycle.add_value('organization', 'Ginkgomaps')
    lifecycle.add_value('url', 'https://dirkbenkert.com/')
    lom.add_value('lifecycle', lifecycle.load_item())

    educational = LomEducationalItemLoader()
    # since the learning objects are maps, 'expositive' seems to be the best fit for interactivityType:
    educational.add_value('interactivityType', 'expositive')
    lom.add_value('educational', educational.load_item())
    base.add_value('lom', lom.load_item())

    vs = ValuespaceItemLoader()
    # since no educationalContext is given, either hardcode these values or don't use them at all
    # vs.add_value('educationalContext', ["Sekundarstufe I",
    #                                     "Sekundarstufe II",
    #                                     "Berufliche Bildung",
    #                                     "Erwachsenenbildung"])
    vs.add_value('intendedEndUserRole', ["learner", "teacher", "parent"])
    vs.add_value('discipline', 'Geografie')  # Geografie
    vs.add_value('learningResourceType', 'map')  # Karte
    vs.add_value('conditionsOfAccess', 'no login')

    lic = LicenseItemLoader()
    # if needed, the license description could also be gathered and constructed from multiple tags within a
    # container: /html/body/center/table[1]/tbody/tr[5]/td[2]/p
    license_url: str = response.xpath('/html/body/center/table[1]/tr[5]/td[2]/p/a/@href').get()
    if (license_url is not None) and license_url.endswith("deed.de"):
        license_url = license_url[:-len("deed.de")]
        license_url = license_url.replace("http://", "https://")
    lic.add_value('url', license_url)
    lic.add_value('author', response.xpath('/html/head/meta[3]/@content').get())

    base.add_value('valuespaces', vs.load_item())
    base.add_value('license', lic.load_item())
    permissions = super().getPermissions(response)
    base.add_value('permissions', permissions.load_item())
    base.add_value('response', super().mapResponse(response).load_item())
    yield base.load_item()
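# --- Usage sketch (not part of the spider): the RegEx-based keyword split used above.
# The sample string is invented, but mirrors the comma-plus-whitespace format described
# in the comment; a plain String.split(",") would keep the stray spaces around the keywords.
import re

sample_keywords = "Deutschland,  Landkarte ,Bundesländer,   Relief"
print(re.split(r'\s*,\s*', sample_keywords))
# -> ['Deutschland', 'Landkarte', 'Bundesländer', 'Relief']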
def parse(self, response: scrapy.http.Response, **kwargs):
    base = super().getBase(response=response)
    # there are no suitable images to serve as thumbnails, therefore SPLASH will have to do
    base.add_value('type', Constants.TYPE_MATERIAL)
    lom = LomBaseItemloader()
    general = LomGeneralItemloader(response=response)
    # description_raw = response.xpath('/html/body/table/tr[4]/td/table/tr/td').get()
    description_raw = response.xpath('//descendant::td[@class="t1fbs"]').getall()
    description_raw: str = ''.join(description_raw)
    # ''.join() always returns a string, so we check for non-emptiness (not None):
    if description_raw:
        description_raw = w3lib.html.remove_tags(description_raw)
        description_raw = w3lib.html.strip_html5_whitespace(description_raw)
        clean_description = w3lib.html.replace_escape_chars(description_raw)
        general.add_value('description', clean_description)
    if len(description_raw) == 0:
        # fallback for exercise-pages where there's only 1 title field and 1 short instruction sentence,
        # e.g.: http://www.zum.de/dwu/depothp/hp-phys/hppme24.htm
        description_fallback = response.xpath('//descendant::div[@id="InstructionsDiv"]/descendant'
                                              '::*/text()').get()
        general.replace_value('description', description_fallback)
    # most of the time the title is stored directly
    title: str = response.xpath('/html/head/title/text()').get()
    if title.startswith("Dieses Info-Fenster"):
        # some subpages carry "Dieses Info-Fenster bleibt bis zum Schließen im Vordergrund" as their title,
        # therefore we need to grab the title from a better-suited element.
        # This also means that the "description" is most probably wrong and needs a replacement as well:
        title = response.xpath('//td[@class="tt1math"]/text()').get()
        title = title.strip()
        # desc_list = response.xpath('/html/body/table[2]/tr/td/table/tr[1]/td[1]/text()').getall()
        desc_list = response.xpath('//td[@class="t1fbs"]/text()').getall()
        if desc_list is not None and len(desc_list) == 0:
            # if the first attempt at grabbing a description fails, we try it at another place
            desc_list = response.xpath('//td[@class="sg12"]/text()').get()
        if desc_list is not None:
            description_raw = ''.join(desc_list)
            # if there are multiple whitespaces within the description, replace them with a single one:
            description_raw = re.sub(' +', ' ', description_raw)
            clean_description = w3lib.html.replace_escape_chars(description_raw)
            general.replace_value('description', clean_description)
    if title is not None:
        title = w3lib.html.replace_escape_chars(title)
    if title is not None:
        # this double-check is necessary for broken headings that ONLY consisted of escape-chars
        if title == '':
            # there are some pages (exercises) that only hold escape chars or whitespaces as their title;
            # the actual title is simply bold text hidden within a div container
            title = response.xpath(
                '//div[@class="Titles"]/h3[@class="ExerciseSubtitle"]/b/text()').get()
            title = title.strip()
        # since we're grabbing titles from headings, a lot of them have a trailing ":"
        if len(title) > 0 and title.endswith(":"):
            # slicing the string right up to the point of the colon
            title = title[:-1]
        general.add_value('title', title)
    general.add_value('identifier', response.url)
    general.add_value('language', 'de')
    # on the vast majority of .htm pages the keywords sit in the http-equiv content tag
    keyword_string = response.xpath('/html/head/meta[@http-equiv="keywords"]/@content').get()
    if keyword_string is None:
        # but on some sub-pages, especially the interactive javascript pages, the keywords are in
        # another container
        keyword_string = response.xpath('/html/head/meta[@name="keywords"]/@content').get()
    if keyword_string is not None:
        keyword_list = keyword_string.rsplit(", ")
        # trying to catch the completely broken keyword strings to clean them up manually,
        # e.g. at http://www.zum.de/dwu/depothp/hp-math/hpmz21.htm check XPath: /html/head/meta[2]
        kw_set = set()
        if keyword_list[0].endswith(","):
            # broken keyword list detected, now we have to manually clean the string up
            broken_keyword_string: str = response.xpath('//meta[@name="keywords"]').get()
            broken_keyword_list = broken_keyword_string.replace('<meta name="keywords" content=', "") \
                .replace(">", "").replace('"', "").replace(",", "").replace("=", "").split(" ")
            for item in broken_keyword_list:
                kw_set.add(item.strip())
        if len(kw_set) == 0:
            # if no broken keyword meta field was found, this condition always triggers
            kw_set = set(keyword_list)
        # checking if the keywords appear in the set of unwanted keywords; if they do, throw them away
        # and only keep the valid ones
        kw_set.difference_update(self.keywords_to_ignore)
        # once this check is done, add the keywords from the (cleaned-up) keyword set
        keyword_list = list(kw_set)
        general.add_value('keyword', keyword_list)
    lom.add_value('general', general.load_item())

    technical = LomTechnicalItemLoader()
    technical.add_value('format', 'text/html')
    technical.add_value('location', response.url)
    lom.add_value('technical', technical.load_item())

    lifecycle = LomLifecycleItemloader()
    lifecycle.add_value('role', 'author')
    lifecycle.add_value('firstName', 'Dieter')
    lifecycle.add_value('lastName', 'Welz')
    lifecycle.add_value('url', '*****@*****.**')
    lifecycle.add_value('organization',
                        response.xpath('/html/head/meta[@http-equiv="organization"]/@content').get())
    lom.add_value('lifecycle', lifecycle.load_item())

    educational = LomEducationalItemLoader()
    lom.add_value('educational', educational.load_item())
    base.add_value('lom', lom.load_item())

    vs = ValuespaceItemLoader()
    # since the website holds both mathematics- and physics-related materials, we need to take a look
    # at the last part of the url: .htm filenames that start with
    #   m | hpm | tkm belong to the discipline mathematics
    #   p | kwp | hpp belong to the discipline physics
    url_last_part = response.url
    url_last_part = url_last_part.split('/')[-1]
    if url_last_part.startswith("m") or url_last_part.startswith("hpm") \
            or url_last_part.startswith("tkm"):
        vs.add_value('discipline', 'Mathematics')
    if url_last_part.startswith("p") or url_last_part.startswith("kwp") or url_last_part.startswith("hpp") \
            or url_last_part.startswith("vcp"):
        vs.add_value('discipline', "Physics")
    vs.add_value('learningResourceType', Constants.TYPE_MATERIAL)
    vs.add_value('intendedEndUserRole', [
        'learner',
        'teacher',
        'parent',
    ])
    vs.add_value('price', 'no')
    vs.add_value('conditionsOfAccess', 'no login')

    lic = LicenseItemLoader()
    lic.add_value('description', 'http://www.zum.de/dwu/hilfe.htm')
    lic.add_value('internal', Constants.LICENSE_CUSTOM)
    lic.add_value('author',
                  response.xpath('/html/head/meta[@http-equiv="author"]/@content').get())

    base.add_value('valuespaces', vs.load_item())
    base.add_value('license', lic.load_item())
    permissions = super().getPermissions(response)
    base.add_value('permissions', permissions.load_item())
    base.add_value('response', super().mapResponse(response).load_item())
    # print(self.parsed_urls)
    # print("debug_url_set length:", len(self.parsed_urls))
    yield base.load_item()
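# --- Usage sketch (not part of the spider): the set-based keyword clean-up used above.
# Sample values are invented; the real spider reads keywords_to_ignore from the spider class.
# Duplicates are collapsed by the set, unwanted entries are dropped via difference_update(),
# and the survivors are handed to the item loader.
keywords_to_ignore = {"dwu", "Unterrichtsmaterialien"}
keyword_list = ["Physik", "dwu", "Mechanik", "Physik", "Unterrichtsmaterialien"]

kw_set = set(keyword_list)                    # collapses duplicates
kw_set.difference_update(keywords_to_ignore)  # drops the unwanted keywords
print(sorted(kw_set))                         # -> ['Mechanik', 'Physik']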
def parse(self, response: scrapy.http.Response, **kwargs) -> BaseItemLoader:
    """
    Scrapy Contracts:
    @url https://kmap.eu/app/browser/Mathematik/Exponentialfunktionen/Asymptoten
    @returns item 1
    """
    last_modified = kwargs.get("lastModified")
    url_data_splash_dict = WebTools.getUrlData(response.url, engine=WebEngine.Pyppeteer)
    splash_html_string = url_data_splash_dict.get('html')
    json_ld_string: str = Selector(text=splash_html_string).xpath('//*[@id="ld"]/text()').get()
    json_ld: dict = json.loads(json_ld_string)
    # TODO: skip-item method - (skips item if it's an empty knowledge map)

    base = BaseItemLoader()
    base.add_value('sourceId', response.url)
    hash_temp = json_ld.get("mainEntity").get("datePublished")
    hash_temp += self.version
    base.add_value('hash', hash_temp)
    base.add_value('lastModified', last_modified)
    base.add_value('type', Constants.TYPE_MATERIAL)
    # Thumbnails have their own url path, which can be found in the json+ld:
    #   "thumbnailUrl": "/snappy/Physik/Grundlagen/Potenzschreibweise"
    # e.g. for the item https://kmap.eu/app/browser/Physik/Grundlagen/Potenzschreibweise
    # the thumbnail can be found at https://kmap.eu/snappy/Physik/Grundlagen/Potenzschreibweise
    thumbnail_path = json_ld.get("mainEntity").get("thumbnailUrl")
    if thumbnail_path is not None:
        base.add_value('thumbnail', 'https://kmap.eu' + thumbnail_path)

    lom = LomBaseItemloader()
    general = LomGeneralItemloader()
    general.add_value('identifier', json_ld.get("mainEntity").get("mainEntityOfPage"))
    keywords_string: str = json_ld.get("mainEntity").get("keywords")
    keyword_list = keywords_string.rsplit(", ")
    general.add_value('keyword', keyword_list)
    general.add_value('title', json_ld.get("mainEntity").get("name"))
    general.add_value('description', json_ld.get("mainEntity").get("description"))
    general.add_value('language', json_ld.get("mainEntity").get("inLanguage"))
    lom.add_value('general', general.load_item())

    technical = LomTechnicalItemLoader()
    technical.add_value('format', 'text/html')
    technical.add_value('location', response.url)
    lom.add_value('technical', technical.load_item())

    lifecycle = LomLifecycleItemloader()
    lifecycle.add_value('role', 'publisher')
    lifecycle.add_value('organization', json_ld.get("mainEntity").get("publisher").get("name"))
    author_email = json_ld.get("mainEntity").get("publisher").get("email")
    if author_email is not None:
        lifecycle.add_value('email', author_email)
    lifecycle.add_value('url', 'https://kmap.eu/')
    lifecycle.add_value('date', json_ld.get("mainEntity").get("datePublished"))
    lom.add_value('lifecycle', lifecycle.load_item())

    educational = LomEducationalItemLoader()
    lom.add_value('educational', educational.load_item())
    base.add_value('lom', lom.load_item())

    vs = ValuespaceItemLoader()
    vs.add_value('discipline', json_ld.get("mainEntity").get("about"))
    vs.add_value('intendedEndUserRole', json_ld.get("mainEntity").get("audience"))
    vs.add_value('learningResourceType', json_ld.get("mainEntity").get("learningResourceType"))
    vs.add_value('price', 'no')
    vs.add_value('conditionsOfAccess', 'login required for additional features')
    base.add_value('valuespaces', vs.load_item())

    lic = LicenseItemLoader()
    lic.add_value('author', json_ld.get("mainEntity").get("author").get("name"))
    lic.add_value('url', json_ld.get("mainEntity").get("license"))
    base.add_value('license', lic.load_item())

    permissions = super().getPermissions(response)
    base.add_value("permissions", permissions.load_item())
    base.add_value('response', super().mapResponse(response).load_item())
    return base.load_item()
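# --- Usage sketch (not part of the spider): extracting the JSON-LD <script id="ld">-block
# from headless-rendered HTML, as done above. The HTML snippet is a stripped-down, invented
# stand-in for a real KMap page.
import json
from parsel import Selector

html_string = ('<html><body><script id="ld" type="application/ld+json">'
               '{"mainEntity": {"name": "Asymptoten", "datePublished": "2021-01-01"}}'
               '</script></body></html>')
json_ld_string = Selector(text=html_string).xpath('//*[@id="ld"]/text()').get()
json_ld: dict = json.loads(json_ld_string)
print(json_ld.get("mainEntity").get("name"))  # -> Asymptoten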
def parse(self, response: scrapy.http.Response, **kwargs):
    """
    Populates a BaseItemLoader with metadata and yields the BaseItem afterwards.

    Scrapy Contracts:
    @url https://www.walter-fendt.de/html5/mde/pythagoras2_de.htm
    @returns items 1
    """
    # fetching publication date and lastModified from the dynamically loaded <p class="Ende">-element:
    url_data_splash_dict = WebTools.getUrlData(response.url, engine=WebEngine.Pyppeteer)
    splash_html_string = url_data_splash_dict.get('html')
    page_end_element = Selector(text=splash_html_string).xpath('//p[@class="Ende"]').get()
    line_regex = re.compile(r'<br>')
    page_end_string = line_regex.split(page_end_element)
    published_date = None
    last_modified = None
    # the two strings inside the <p>-container will look like this:
    #   Walter Fendt, 2. November 2000
    #   Letzte Änderung: 17. Oktober 2017
    # therefore we'll need to extract the dates by splitting up the strings
    for temp_string in page_end_string:
        if temp_string.startswith("Walter Fendt"):
            sentence1 = temp_string.rsplit(', ')
            # each "sentence" list now holds exactly 2 elements, whereby the last element should be the date
            for item in sentence1:
                if dateparser.parse(item) is not None:
                    published_date = dateparser.parse(item)
        if temp_string.startswith('Letzte Änderung:'):
            sentence2 = temp_string.rsplit(': ')
            for item2 in sentence2:
                if dateparser.parse(item2) is not None:
                    last_modified = dateparser.parse(item2)

    base = super().getBase(response=response)
    base.add_value('type', Constants.TYPE_MATERIAL)
    if last_modified is not None:
        hash_temp = last_modified.isoformat() + self.version
        base.add_value('hash', hash_temp)
        base.add_value('lastModified', last_modified.isoformat())
    base.add_value('sourceId', response.url)

    lom = LomBaseItemloader()
    general = LomGeneralItemloader(response=response)
    general.add_value('identifier', response.url)
    general.add_value('title', response.xpath('/html/head/title/text()').get())
    general.add_value('description',
                      response.xpath('/html/head/meta[@name="description"]/@content').get())
    keywords_string: str = response.xpath('/html/head/meta[@name="keywords"]/@content').get()
    if keywords_string is not None:
        keyword_list = keywords_string.rsplit(", ")
        general.add_value('keyword', keyword_list)
    general.add_value('language', 'de')
    lom.add_value('general', general.load_item())

    technical = LomTechnicalItemLoader()
    technical.add_value('format', "text/html")
    technical.add_value('location', response.url)
    lom.add_value('technical', technical.load_item())

    lifecycle = LomLifecycleItemloader()
    lifecycle.add_value('role', 'author')
    lifecycle.add_value('firstName', 'Walter')
    lifecycle.add_value('lastName', 'Fendt')
    lifecycle.add_value('url', "https://www.walter-fendt.de/wf.htm")  # author information
    if published_date is not None:
        lifecycle.add_value('date', published_date.isoformat())
    lom.add_value('lifecycle', lifecycle.load_item())

    educational = LomEducationalItemLoader()
    educational.add_value('interactivityType', 'mixed')
    lom.add_value('educational', educational.load_item())
    base.add_value('lom', lom.load_item())

    vs = ValuespaceItemLoader()
    vs.add_value('conditionsOfAccess', 'no login')
    vs.add_value('discipline', 'Mathematik')
    vs.add_value('intendedEndUserRole', ['learner', 'teacher', 'parent'])
    vs.add_value('learningResourceType', ['application', 'web page'])
    vs.add_value('price', 'no')
    base.add_value('valuespaces', vs.load_item())

    lic = LicenseItemLoader()
    lic.add_value('author', 'Walter Fendt')
    # if scrapy could render the <p class="Ende">-element, the license url could be found with this XPath:
    #   license_url = response.xpath('//p[@class="Ende"]/a[@rel="license"]/@href')
    # but since scrapy can't "see" this container, we're extracting the information from the
    # headless-browser-rendered HTML instead:
    license_url: str = Selector(text=splash_html_string).xpath(
        '//p[@class="Ende"]/a[@rel="license"]/@href').get()
    if license_url is not None:
        if license_url.startswith("http://"):
            license_url = license_url.replace("http://", "https://")
        # the license url links to the /de/ version, which currently doesn't get mapped properly:
        #   "https://creativecommons.org/licenses/by-nc-sa/3.0/de/"
        # -> 'https://creativecommons.org/licenses/by-nc-sa/3.0/' is the url-format we want
        if "creativecommons.org/licenses/" in license_url and license_url.endswith("/de/"):
            license_url = license_url.split("de/")[0]
        lic.add_value('url', license_url)
    base.add_value('license', lic.load_item())

    permissions = super().getPermissions(response)
    base.add_value('permissions', permissions.load_item())
    # TODO: fix super().mapResponse
    base.add_value('response', super().mapResponse(response).load_item())
    yield base.load_item()
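# --- Usage sketch (not part of the spider): dateparser handles the German date strings
# from the <p class="Ende">-element without an explicit format string. The two samples
# mirror the structure quoted in the comments above.
import dateparser

published_date = dateparser.parse("2. November 2000")
last_modified = dateparser.parse("17. Oktober 2017")
print(published_date.isoformat())  # -> 2000-11-02T00:00:00
print(last_modified.isoformat())   # -> 2017-10-17T00:00:00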
def get_metadata_from_review_url(self, response: scrapy.http.Response, **kwargs):
    """
    Grabs metadata from the "material_review_url"-page and uses the wp_json_item from the
    "parse_page"-method to yield a BaseItem with the combined metadata from both sources.

    :param response: the scrapy.http.Response object for the currently parsed page
    :param kwargs: wp_json_item-dictionary
    """
    # logging.debug("DEBUG inside get_metadata_from_review_url: wp_json_item id", kwargs.get("id"))
    wp_json_item = kwargs.get("item")
    # logging.debug("DEBUG inside get_metadata_from_review_url: response type = ", type(response),
    #               "url =", response.url)
    ld_json_string = response.xpath('/html/head/script[@type="application/ld+json"]/text()').get().strip()
    ld_json_string = html.unescape(ld_json_string)
    ld_json = json.loads(ld_json_string)

    hash_temp: Optional[str] = None
    language_temp: Optional[str] = None
    pub_date: Optional[str] = None
    organization_id: Optional[str] = None
    organization_name: Optional[str] = None
    date_modified: Optional[str] = None
    # this is a workaround to make sure that we actually grab the following data,
    # no matter where it is positioned in the list:
    #   - dateModified
    #   - inLanguage
    #   - datePublished
    #   - organization_name and url
    # e.g.: since the number of elements the "@graph"-array holds seems to fluctuate, we can't be sure
    # which position "dateModified" actually has: sometimes it's ld_json.get("@graph")[2], sometimes [3]
    # etc., therefore we must check all of them
    ld_graph_items = ld_json.get("@graph")
    for item in ld_graph_items:
        if item.get("dateModified") is not None:
            date_modified = item.get("dateModified")  # this can be used instead of 'date' in lastModified
            hash_temp = item.get("dateModified") + self.version
        if item.get("@type") == "WebSite":
            language_temp = item.get("inLanguage")
        if item.get("@type") == "WebPage":
            pub_date = item.get("datePublished")
        if item.get("@type") == "Organization":
            organization_id = item.get("@id")
            organization_name = item.get("name")

    base = BaseItemLoader()
    base.add_value("sourceId", response.url)
    base.add_value("hash", hash_temp)
    # base.add_value("response", super().mapResponse(response).load_item())
    base.add_value("type", Constants.TYPE_MATERIAL)  # TODO: is this correct? use mapping for edu-context?
    base.add_value("thumbnail", wp_json_item.get("material_screenshot"))
    # base.add_value("lastModified", wp_json_item.get("date"))  # is "date" from wp_json correct here?
    base.add_value("lastModified", date_modified)  # or is this one better (grabbed from material_review_url)?

    lom = LomBaseItemloader()
    general = LomGeneralItemloader(response=response)
    general.add_value("title", wp_json_item.get("material_titel"))
    # the source material heavily fluctuates between perfectly fine strings and messy (hardcoded) html tags
    # as well as "\n" and "\t", therefore we need to clean up that string first:
    raw_description = wp_json_item.get("material_beschreibung")
    raw_description = w3lib.html.remove_tags(raw_description)
    raw_description = w3lib.html.strip_html5_whitespace(raw_description)
    clean_description = w3lib.html.replace_escape_chars(raw_description)
    general.add_value("description", clean_description)
    general.add_value("identifier", wp_json_item.get("id"))
    if language_temp is not None:
        general.add_value("language", language_temp)
    kw_temp = list()
    for item in wp_json_item.get("material_schlagworte"):
        kw_temp.append(item.get("name"))
    general.add_value("keyword", kw_temp)
    lom.add_value("general", general.load_item())

    technical = LomTechnicalItemLoader()
    technical.add_value("format", "text/html")
    technical.add_value("location", wp_json_item.get("material_review_url"))
    lom.add_value("technical", technical.load_item())

    lifecycle = LomLifecycleItemloader()
    if organization_name is not None:
        lifecycle.add_value("organization", organization_name)
    if organization_id is not None:
        lifecycle.add_value("url", organization_id)
    if pub_date is not None:
        lifecycle.add_value("date", pub_date)
    lom.add_value("lifecycle", lifecycle.load_item())

    educational = LomEducationalItemLoader()
    if wp_json_item.get("material_altersstufe") is not None:
        # the age range is returned as a list of <from_age>-<to_age>-strings; possible return values are
        # e.g. "01-05", "05-10", "10-13", "13-15", "15-19" and "18-99"
        age_regex = re.compile(r'(\d{1,2})-(\d{1,2})')
        age_range = set()
        age_range_item_loader = LomAgeRangeItemLoader()
        for item in wp_json_item.get("material_altersstufe"):
            age_range_temp = item.get("name")
            age_from = str(age_regex.search(age_range_temp).group(1))
            age_to = str(age_regex.search(age_range_temp).group(2))
            age_range.add(age_from)
            age_range.add(age_to)
        # print("FINAL AGE_RANGE: min = ", min(age_range), " max = ", max(age_range))
        if len(age_range) != 0:
            age_range_item_loader.add_value("fromRange", min(age_range))
            age_range_item_loader.add_value("toRange", max(age_range))
            educational.add_value("typicalAgeRange", age_range_item_loader.load_item())
    lom.add_value("educational", educational.load_item())
    base.add_value("lom", lom.load_item())

    vs = ValuespaceItemLoader()
    vs.add_value("discipline", "http://w3id.org/openeduhub/vocabs/discipline/520")  # Religion
    # mapping educationalContext
    educational_context = list()
    for edu_con_item in wp_json_item.get("material_bildungsstufe"):
        educational_context.append(edu_con_item.get("name"))
    for edu_item in educational_context:
        if edu_item in self.mapping_edu_context.keys():
            edu_item = self.mapping_edu_context.get(edu_item)
        if edu_item != "":
            vs.add_value("educationalContext", edu_item)
    # using the mapped media_type_list for valuespaces -> learningResourceType
    media_type_list = list()
    for item in wp_json_item.get("material_medientyp"):
        media_type_list.append(item.get("name"))
    for media_type_item in media_type_list:
        if media_type_item in self.mapping_media_types.keys():
            media_type_item = self.mapping_media_types.get(media_type_item)
        if media_type_item != "":
            vs.add_value("learningResourceType", media_type_item)
    # see: https://vocabs.openeduhub.de/w3id.org/openeduhub/vocabs/learningResourceType/index.html
    # there's metadata for "Kompetenzen" (e.g.: "Deuten", "Gestalten", "Reflexion") within the returned
    # wp_json that our data-model doesn't support yet. for future reference though:
    #   wp_json_item.get("material_kompetenzen") -> list
    vs.add_value("intendedEndUserRole", "teacher")

    lic = LicenseItemLoader()
    license_regex_nc_reuse = re.compile(r'Zur nicht kommerziellen Wiederverwendung gekennzeichnet')
    license_regex_nc_reuse_and_change = re.compile(
        r'Zur nicht kommerziellen Wiederverwendung und Veränderung gekennzeichnet')
    # important clarification from rpi-virtuell:
    #   'frei zugänglich' describes 'ungeklärte Lizenz' / 'volles Urheberrecht'
    #   CC licenses > 'frei zugänglich' if both values are found in the license description
    license_regex_free_access = re.compile(r'frei zugänglich')
    license_regex_free_after_signup = re.compile(r'kostenfrei nach Anmeldung')
    license_regex_with_costs = re.compile(r'kostenpflichtig')
    license_description = response.xpath('//div[@class="material-detail-meta-access material-meta"]'
                                         '/div[@class="material-meta-content-entry"]/text()').get()
    if license_description is not None:
        license_description = html.unescape(license_description.strip())
        lic.add_value("description", license_description)
        cc_by_nc_nd = license_regex_nc_reuse.search(license_description)
        cc_by_nc_sa = license_regex_nc_reuse_and_change.search(license_description)
        # if the RegEx search finds something, it returns a match-object; otherwise it returns None
        if cc_by_nc_nd is not None:
            lic.add_value("url", Constants.LICENSE_CC_BY_NC_ND_40)
        if cc_by_nc_sa is not None:
            lic.add_value("url", Constants.LICENSE_CC_BY_NC_SA_30)
        # if a material is "frei zugänglich", set price to none, but don't override a previously set
        # CC-license
        if license_regex_free_access.search(license_description) is not None:
            vs.add_value("price", "no")
        # only if "frei zugänglich" is the only license-description will this trigger:
        # see https://rpi-virtuell.de/nutzungsbedingungen/ (5.)
        if license_regex_free_access.match(license_description) is not None:
            lic.add_value("url", Constants.LICENSE_CC_BY_SA_40)
        if license_regex_with_costs.search(license_description):
            lic.add_value("internal", Constants.LICENSE_COPYRIGHT_LAW)
            vs.add_value("price", "yes")
        if license_regex_free_after_signup.search(license_description):
            vs.add_value("price", "yes")
            vs.add_value("conditionsOfAccess", "login")
    else:
        # by default, all materials should be CC_BY_SA - according to the rpi-virtuell ToS
        lic.replace_value("url", Constants.LICENSE_CC_BY_SA_40)
    authors = list()
    # the author should end up in LOM lifecycle, but the returned metadata is too messily formatted
    # to parse by easy patterns like (first name) + (last name)
    for item in wp_json_item.get("material_autoren"):
        if item.get("name") is not None:
            if item.get("name").strip() != "":
                authors.append(item.get("name"))
    lic.add_value("author", authors)

    base.add_value("valuespaces", vs.load_item())
    base.add_value("license", lic.load_item())
    permissions = super().getPermissions(response)
    base.add_value("permissions", permissions.load_item())
    response_loader = ResponseItemLoader()
    response_loader.add_value("url", response.url)
    base.add_value("response", response_loader.load_item())
    yield base.load_item()
def parse(self, response: scrapy.http.Response, **kwargs) -> BaseItemLoader:
    base = BaseItemLoader()
    # ALL possible keys for the different Item- and ItemLoader-classes can be found inside converter/items.py

    # TODO: fill "base"-keys with values for
    #  - sourceId        required (see: getId()-method above)
    #  - hash            required (see: getHash()-method above)
    #  - lom             required (see: LomBaseItemLoader below)
    #  - valuespaces     required (see: ValueSpacesItemLoader below)
    #  - permissions     required (see: PermissionItemLoader below)
    #  - license         required (see: LicenseItemLoader below)
    #  - lastModified    recommended
    #  - type            recommended
    #  - thumbnail       recommended
    #  - publisher       optional
    base.add_value('sourceId', response.url)
    # if the source doesn't have a "datePublished" or "lastModified"-value in its header or JSON_LD,
    # you might have to help yourself with a unique string consisting of the datetime of the crawl
    # + self.version
    hash_temp: str = "This string should consist of a date (publication date, preferably)" + self.version
    base.add_value('hash', hash_temp)
    last_modified = None
    base.add_value('lastModified', last_modified)
    # sometimes you might get a "type"-value from the JSON_LD. If it's not supplied by the website
    # you're crawling, you might need to use a constant:
    base.add_value('type', Constants.TYPE_MATERIAL)
    thumbnail_url: str = "This string should hold the thumbnail URL"
    base.add_value('thumbnail', thumbnail_url)

    lom = LomBaseItemloader()
    # TODO: afterwards fill up the LomBaseItem with
    #  - LomGeneralItem            required
    #  - LomTechnicalItem          required
    #  - LomLifeCycleItem          required (multiple possible)
    #  - LomEducationalItem        required
    #  - LomClassificationItem     optional
    general = LomGeneralItemloader()
    # TODO: fill "general"-keys with values for
    #  - identifier          required
    #  - title               required
    #  - keyword             required
    #  - description         required
    #  - language            recommended
    #  - coverage            optional
    #  - structure           optional
    #  - aggregationLevel    optional
    # e.g.: the unique identifier might be the URL to a material
    general.add_value('identifier', response.url)
    # TODO: don't forget to add key-value-pairs for 'title', 'keyword' and 'description'!
    # once we've added all available values to the necessary keys in our LomGeneralItemLoader,
    # we call the load_item()-method to return a (now filled) LomGeneralItem to the LomBaseItemLoader
    lom.add_value('general', general.load_item())

    technical = LomTechnicalItemLoader()
    # TODO: fill "technical"-keys with values for
    #  - format                      required (expected: MIME-type, e.g. 'text/html' for web-sites)
    #  - location                    required (expected: URI / URL of a learning object / material)
    #  - size                        optional
    #  - requirement                 optional
    #  - installationRemarks         optional
    #  - otherPlatformRequirements   optional
    #  - duration                    optional (only applies to audiovisual content like videos/podcasts)
    # similar to how the "general"-LomGeneralItemLoader was filled with Items, individual values can be
    # set with
    #   technical.add_value('key', 'value')
    # or replaced with:
    #   technical.replace_value('key', 'value')
    technical.add_value('format', 'text/html')  # e.g. if the learning object is a web-page
    technical.add_value('location', response.url)  # if the learning object has a unique URL that's
    # being navigated by the crawler
    lom.add_value('technical', technical.load_item())

    lifecycle = LomLifecycleItemloader()
    # TODO: fill "lifecycle"-keys with values for
    #  - role            recommended
    #  - firstName       recommended
    #  - lastName        recommended
    #  - url             recommended
    #  - date            recommended
    #  - organization    optional
    #  - email           optional
    #  - uuid            optional
    lifecycle.add_value('role', 'author')  # supported roles: "author" / "editor" / "publisher"
    # for the available role mapping, please take a look at converter/es_connector.py
    lom.add_value('lifecycle', lifecycle.load_item())

    educational = LomEducationalItemLoader()
    # TODO: fill "educational"-keys with values for
    #  - description             recommended (= "Comments on how this learning object is to be used")
    #  - language                recommended
    #  - interactivityType       optional
    #  - interactivityLevel      optional
    #  - semanticDensity         optional
    #  - typicalAgeRange         optional
    #  - difficulty              optional
    #  - typicalLearningTime     optional
    lom.add_value('educational', educational.load_item())

    classification = LomClassificationItemLoader()
    # TODO: fill "classification"-keys with values for
    #  - cost            optional
    #  - purpose         optional
    #  - taxonPath       optional
    #  - description     optional
    #  - keyword         optional
    lom.add_value('classification', classification.load_item())

    # once you've filled "general", "technical", "lifecycle" and "educational" with values,
    # the LomBaseItem is loaded into the "base"-BaseItemLoader
    base.add_value('lom', lom.load_item())

    vs = ValuespaceItemLoader()
    # for possible values, either consult https://vocabs.openeduhub.de
    # or take a look at https://github.com/openeduhub/oeh-metadata-vocabs
    # TODO: fill "valuespaces"-keys with values for
    #  - discipline                  recommended
    #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/discipline.ttl)
    #  - intendedEndUserRole         recommended
    #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/intendedEndUserRole.ttl)
    #  - learningResourceType        recommended
    #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/learningResourceType.ttl)
    #  - conditionsOfAccess          recommended
    #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/conditionsOfAccess.ttl)
    #  - containsAdvertisement       recommended
    #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/containsAdvertisement.ttl)
    #  - price                       recommended
    #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/price.ttl)
    #  - educationalContext          optional
    #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/educationalContext.ttl)
    #  - sourceContentType           optional
    #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/sourceContentType.ttl)
    #  - toolCategory                optional
    #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/toolCategory.ttl)
    #  - accessibilitySummary        optional
    #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/accessibilitySummary.ttl)
    #  - dataProtectionConformity    optional
    #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/dataProtectionConformity.ttl)
    #  - fskRating                   optional
    #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/fskRating.ttl)
    #  - oer                         optional
    #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/oer.ttl)
    base.add_value('valuespaces', vs.load_item())

    lic = LicenseItemLoader()
    # TODO: fill "license"-keys with values for
    #  - url              required
    #  - oer              recommended ('oer' is automatically set if the 'url'-field above
    #    is recognized in LICENSE_MAPPINGS: for possible url-mapping values, please take a look at
    #    LICENSE_MAPPINGS in converter/constants.py)
    #  - author           recommended
    #  - internal         optional
    #  - description      optional
    #  - expirationDate   optional (for content that expires, e.g. ÖR-Mediatheken)
    base.add_value('license', lic.load_item())

    # either fill the PermissionItemLoader manually (not necessary most of the time)
    permissions = PermissionItemLoader()
    # or (preferably) call the inherited getPermissions(response)-method
    # from converter/spiders/base_classes/lom_base.py by using super().:
    #   permissions = super().getPermissions(response)
    # TODO: if necessary, add/replace values for the following "permissions"-keys
    #  - public                    optional
    #  - groups                    optional
    #  - mediacenters              optional
    #  - autoCreateGroups          optional
    #  - autoCreateMediacenters    optional
    base.add_value('permissions', permissions.load_item())

    # either fill the ResponseItemLoader manually (not necessary most of the time)
    response_loader = ResponseItemLoader()
    # or (preferably) call the inherited mapResponse(response)-method
    # from converter/spiders/base_classes/lom_base.py by using super().:
    #   response_loader = super().mapResponse(response)
    # TODO: if necessary, add/replace values for the following "response"-keys
    #  - url         required
    #  - status      optional
    #  - html        optional
    #  - text        optional
    #  - headers     optional
    #  - cookies     optional
    #  - har         optional
    base.add_value('response', response_loader.load_item())
    # once all scrapy.Items are loaded into our "base", we yield the BaseItem by calling the
    # .load_item()-method
    yield base.load_item()
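# --- Usage sketch (not part of the template): the nested ItemLoader pattern the TODOs above
# walk through, reduced to its bare minimum. The import path and the example values are
# assumptions; the loader classes live in converter/items.py as mentioned in the comments above.
from converter.items import BaseItemLoader, LomBaseItemloader, LomGeneralItemloader  # path assumed

lom = LomBaseItemloader()
general = LomGeneralItemloader()
general.add_value('identifier', 'https://example.org/material/123')  # invented URL
general.add_value('title', 'Example title')
lom.add_value('general', general.load_item())  # child item -> parent loader

base = BaseItemLoader()
base.add_value('lom', lom.load_item())
item = base.load_item()  # the finished BaseItem that would be yielded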
def parse(self, response: scrapy.http.Response, **kwargs):
    """
    Parses an individual 'worksheet' and combines the metadata with data from its 'bundle'-dictionary.

    Spider Contracts:
    @url https://editor.mnweg.org/mnw/dokument/vocabulary-around-the-world-3
    @returns items 1

    :return: yields a BaseItemLoader
    """
    # since we're only parsing the first worksheet for some additional metadata, the metadata object
    # will be centered around a bundle, not the individual pages
    # print("DEBUG parse_worksheet_page", response.url)
    date_published = response.xpath('//ul[@class="meta"]/li[3]/text()').get()

    base = BaseItemLoader()
    base.add_value("sourceId", kwargs.get('bundle_url'))
    hash_temp = str(date_published + self.version)
    base.add_value("hash", hash_temp)
    # this is a hacky solution: the thumbnail is the miniature preview of the bundle's first worksheet
    bundle_thumbnail = kwargs.get('bundle_thumbnail')
    if bundle_thumbnail is not None:
        base.add_value('thumbnail', bundle_thumbnail)
    base.add_value('type', Constants.TYPE_MATERIAL)
    base.add_value('lastModified', date_published)

    lom = LomBaseItemloader()
    general = LomGeneralItemloader()
    general.add_value('title', kwargs.get('bundle_title'))
    description_temp = str()
    bundle_desc_temp = kwargs.get('bundle_description')
    worksheet_desc_temp = kwargs.get('worksheet_description_summary')
    # not every bundle has a description, but there are always worksheet descriptions available:
    if bundle_desc_temp is not None:
        description_temp: str = bundle_desc_temp + "\n\n" + worksheet_desc_temp
    elif bundle_desc_temp is None and worksheet_desc_temp is not None:
        description_temp: str = worksheet_desc_temp
    # print(description_temp)
    general.add_value('description', description_temp)
    general.add_value('language', 'de')
    general.add_value('identifier', kwargs.get('bundle_url'))
    lom.add_value('general', general.load_item())

    technical = LomTechnicalItemLoader()
    technical.add_value("format", "text/html")
    technical.add_value('location', kwargs.get('bundle_url'))
    lom.add_value('technical', technical.load_item())

    lifecycle = LomLifecycleItemloader()
    bundle_organization: dict = kwargs.get('bundle_ld_json_organization')
    # the dictionary that we can parse from the website itself looks like this:
    #   'organization': {'@context': 'http://schema.org',
    #                    '@type': 'Organization',
    #                    'name': 'Materialnetzwerk e. G.',
    #                    'sameAs': ['http://twitter.com/materialnw',
    #                               'https://www.facebook.com/materialnetzwerk'],
    #                    'url': 'https://editor.mnweg.org'}
    # TODO: once it's possible to parse an 'organization'-schema-type as a dictionary by the back-end, use
    #   lifecycle.add_value('organization', bundle_organization)
    if bundle_organization is not None:
        lifecycle.add_value('organization', bundle_organization.get("name"))
        lifecycle.add_value('url', bundle_organization.get("url"))
    lifecycle.add_value('date', date_published)
    lom.add_value('lifecycle', lifecycle.load_item())

    educational = LomEducationalItemLoader()
    # TODO: educationalLevel is currently unsupported in the items.py backend?
    educational_level = kwargs.get('bundle_educational_level')
    if educational_level is not None:
        educational.add_value('educationalLevel', educational_level)
    lom.add_value('educational', educational.load_item())
    base.add_value('lom', lom.load_item())

    vs = ValuespaceItemLoader()
    vs.add_value('learningResourceType', 'teaching module')
    bundle_discipline = kwargs.get('bundle_discipline')
    if bundle_discipline is not None:
        if self.discipline_mapping.get(bundle_discipline) is not None:
            bundle_discipline = self.discipline_mapping.get(bundle_discipline)
        vs.add_value('discipline', bundle_discipline)
    vs.add_value('intendedEndUserRole', 'teacher')
    # logged-in users can manipulate the worksheets and fit them to their needs,
    # but there's no login required for just downloading the pdf of an available worksheet
    vs.add_value('conditionsOfAccess', "login required for additional features")
    vs.add_value('price', 'no')
    # we can map "Phase" to our educationalContext with the following ValuespaceHelper method:
    if educational_level is not None:
        vs.add_value("educationalContext",
                     ValuespaceHelper.educationalContextByGrade(educational_level))

    lic = LicenseItemLoader()
    # everything is CC-BY-SA 3.0 according to the FAQs: https://mnweg.org/faqs
    lic.add_value('url', Constants.LICENSE_CC_BY_SA_30)
    base.add_value('license', lic.load_item())

    response_loader = ResponseItemLoader()
    response_loader.add_value('url', kwargs.get('bundle_url'))
    base.add_value('valuespaces', vs.load_item())
    base.add_value('response', response_loader.load_item())
    yield base.load_item()
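# --- Usage sketch (not part of the spider): pulling 'name' and 'url' out of the
# 'organization'-dictionary, using the example structure quoted in the comments above.
bundle_organization = {
    '@context': 'http://schema.org',
    '@type': 'Organization',
    'name': 'Materialnetzwerk e. G.',
    'sameAs': ['http://twitter.com/materialnw',
               'https://www.facebook.com/materialnetzwerk'],
    'url': 'https://editor.mnweg.org',
}
print(bundle_organization.get("name"))  # -> Materialnetzwerk e. G.
print(bundle_organization.get("url"))   # -> https://editor.mnweg.org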
def parse_site(self, response: scrapy.http.HtmlResponse, sitemap_entry: SitemapEntry = None):
    # extract the JSON-LD
    data = jslde.extract(response.text)[0]
    response.meta['sitemap_entry'] = sitemap_entry
    base = super().getBase(response=response)
    base.add_value("response", super().mapResponse(response).load_item())
    # we assume that content is imported. Please use replace_value if you import something different
    base.add_value("type", Constants.TYPE_MATERIAL)
    base.add_value('thumbnail', data.get("thumbnailUrl", None))
    base.add_value('lastModified', data.get("dateModified", None))
    for publisher in data.get("publisher", []):
        # TODO: add the type as well, e.g. organization
        base.add_value("publisher", publisher.get("name"))

    lom = LomBaseItemloader()
    general = LomGeneralItemloader(response=response)
    general.add_value('title', data.get("name", None))
    general.add_value('description', data.get("description", None))
    general.add_value("identifier", data.get("identifier", None))
    for language in data.get("language", []):
        general.add_value("language", language)
    lom.add_value("general", general.load_item())

    technical = LomTechnicalItemLoader()
    technical.add_value('format', 'text/html')
    technical.add_value('location', sitemap_entry.loc)
    lom.add_value("technical", technical.load_item())

    lifecycle = LomLifecycleItemloader()
    lom.add_value("lifecycle", lifecycle.load_item())
    edu = LomEducationalItemLoader()
    lom.add_value("educational", edu.load_item())
    # classification = LomClassificationItemLoader()
    # lom.add_value("classification", classification.load_item())
    base.add_value("lom", lom.load_item())

    vs = ValuespaceItemLoader()
    for audience in data.get("audience", []):
        vs.add_value("intendedEndUserRole", audience)
    # "about" is delivered as a single comma-separated string, so it has to be split up;
    # the default must be an empty string (a list default would break the .split() call):
    for discipline in (d.strip() for d in data.get("about", "").split(",") if d.strip()):
        if discipline in about_maps.keys():
            discipline = about_maps[discipline]
        vs.add_value('discipline', discipline)
    for lrt in data.get("type", []):
        vs.add_value('learningResourceType', lrt)
    base.add_value("valuespaces", vs.load_item())

    lic = LicenseItemLoader()
    lic.add_value('url', data.get("license", None))
    for creator in data.get("creator", []):
        lic.add_value("author", creator.get("name", ""))
    base.add_value("license", lic.load_item())

    permissions = super().getPermissions(response)
    base.add_value("permissions", permissions.load_item())
    response_loader = ResponseItemLoader()
    response_loader.add_value('url', response.url)
    base.add_value("response", response_loader.load_item())
    yield base.load_item()
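# --- Usage sketch (not part of the spider): the "about"-handling above. The JSON-LD field
# arrives as one comma-separated string, so it's split and stripped before the optional
# about_maps lookup. The mapping table and sample data are invented.
about_maps = {"Mathe": "Mathematik"}  # stand-in for the real mapping table

data = {"about": "Mathe, Physik"}
for discipline in (d.strip() for d in data.get("about", "").split(",") if d.strip()):
    discipline = about_maps.get(discipline, discipline)
    print(discipline)  # -> Mathematik, then Physik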
def parse(self, response: scrapy.http.Response, **kwargs):
    """
    Parses an individual topic url for metadata and yields a BaseItem.

    Scrapy Contracts:
    @url https://www.umwelt-im-unterricht.de/hintergrund/generationengerechtigkeit-klimaschutz-und-eine-lebenswerte-zukunft/
    @returns item 1
    """
    current_url: str = response.url
    base = BaseItemLoader()
    base.add_value('sourceId', response.url)
    date_raw: str = response.xpath('//div[@class="b-cpsuiu-show-info"]/span/text()').get()
    date_cleaned_up: str = w3lib.html.strip_html5_whitespace(date_raw)
    hash_temp = str(date_cleaned_up + self.version)
    base.add_value('hash', hash_temp)
    base.add_value('lastModified', date_cleaned_up)
    base.add_value('type', Constants.TYPE_MATERIAL)
    # base.add_value('thumbnail', thumbnail_url)

    lom = LomBaseItemloader()
    general = LomGeneralItemloader()
    general.add_value('identifier', response.url)
    title: str = response.xpath('//div[@class="tx-cps-uiu"]/article/h1/text()').get()
    general.add_value('title', title)
    keywords: list = response.xpath('//div[@class="b-cpsuiu-show-keywords"]/ul/li/a/text()').getall()
    if len(keywords) >= 1:
        # only add keywords if the list isn't empty
        general.add_value('keyword', keywords)
    description: str = response.xpath('/html/head/meta[@name="description"]/@content').get()
    general.add_value('description', description)
    general.add_value('language', 'de')
    lom.add_value('general', general.load_item())

    technical = LomTechnicalItemLoader()
    technical.add_value('format', 'text/html')
    technical.add_value('location', response.url)
    lom.add_value('technical', technical.load_item())

    lifecycle = LomLifecycleItemloader()
    lifecycle.add_value('role', 'publisher')
    lifecycle.add_value('date', date_cleaned_up)
    lifecycle.add_value('url', "https://www.umwelt-im-unterricht.de/impressum/")
    lifecycle.add_value('organization',
                        'Bundesministerium für Umwelt, Naturschutz und nukleare Sicherheit (BMU)')
    lom.add_value('lifecycle', lifecycle.load_item())

    educational = LomEducationalItemLoader()
    educational.add_value('language', 'de')
    # TODO: a didactic comment could fit into either one of these:
    #  - educational.description
    #  - classification.description (with classification.purpose set to 'educational objective')
    if "/wochenthemen/" in current_url:
        # didactic comments are only part of "Thema der Woche"
        didactic_comment = response.xpath('//div[@class="c-collapse-content js-collapse-content"]').get()
        if didactic_comment is not None:
            didactic_comment = w3lib.html.remove_tags(didactic_comment)
            # didactic_comment = w3lib.html.replace_escape_chars(didactic_comment, which_ones='\t',
            #                                                    replace_by=" ")
            # didactic_comment = w3lib.html.replace_escape_chars(didactic_comment)
            didactic_comment = " ".join(didactic_comment.split())
            if didactic_comment.endswith("mehr lesenweniger lesen"):
                # the button-description of the expandable info-box ends up in the string,
                # therefore we are manually removing it:
                didactic_comment = didactic_comment.replace("mehr lesenweniger lesen", "")
            # since there's currently no way to confirm how the string looks in the web-interface:
            # ToDo: make sure which string format looks best in edu-sharing
            #  (cleaned up <-> with escape chars)
            educational.add_value('description', didactic_comment)
    lom.add_value('educational', educational.load_item())

    classification = LomClassificationItemLoader()
    if "/unterrichtsvorschlaege/" in current_url:
        classification.add_value('purpose', 'competency')
        competency_description: list = response.xpath(
            '//div[@class="b-cpsuiu-show-description"]/*[not(@class="cc-licence-info")]').getall()
        # the xpath-expression for competency_description will grab the whole div-element,
        # but EXCLUDE the "license"-container (if the license-description exists, it's always part of
        # the same div)
        if len(competency_description) >= 1:
            # only if the list of strings is not empty do we try to type-convert it to a string
            # (and clean its formatting up)
            competency_description: str = " ".join(competency_description)
            competency_description = w3lib.html.remove_tags(competency_description)
            classification.add_value('description', competency_description)
    lom.add_value('classification', classification.load_item())
    base.add_value('lom', lom.load_item())

    vs = ValuespaceItemLoader()
    # depending on the website-category, we need to set a specific learningResourceType
    # because the value 'website' for all crawled items would not be helpful enough
    if "/wochenthemen/" in current_url or "/unterrichtsvorschlaege/" in current_url:
        vs.add_value('learningResourceType', 'lesson plan')
    if "/hintergrund/" in current_url:
        vs.add_value('learningResourceType', 'Text')
    if "/medien/dateien/" in current_url:
        # topics categorized as "Arbeitsmaterial" offer customizable worksheets to teachers
        vs.add_value('learningResourceType', 'worksheet')
    if "/medien/videos/" in current_url:
        vs.add_value('learningResourceType', 'video')
    if "/medien/bilder/" in current_url:
        # topics categorized as "Bilderserie" hold several images in a gallery (with individual licenses)
        vs.add_value('learningResourceType', 'image')
    vs.add_value('price', 'no')
    vs.add_value('containsAdvertisement', 'no')
    vs.add_value('conditionsOfAccess', 'no login')
    vs.add_value('intendedEndUserRole', 'teacher')
    # see: https://www.umwelt-im-unterricht.de/ueber-umwelt-im-unterricht/
    vs.add_value('accessibilitySummary', 'Not tested')
    # see: https://www.umwelt-im-unterricht.de/erklaerung-zur-barrierefreiheit/
    vs.add_value('dataProtectionConformity', 'Sensible data collection')
    # see: https://www.umwelt-im-unterricht.de/datenschutz/

    disciplines_raw: list = response.xpath(
        '//div[@class="b-cpsuiu-show-subjects"]/ul/li/a/text()').getall()
    if len(disciplines_raw) >= 1:
        disciplines = list()
        for discipline_value in disciplines_raw:
            # self.debug_discipline_values.add(discipline_value)
            if discipline_value in self.DISCIPLINE_MAPPING.keys():
                discipline_value = self.DISCIPLINE_MAPPING.get(discipline_value)
            # since the mapping value can either be a single string OR a list of strings, we need to make
            # sure that our 'disciplines'-list is a list of strings (not a list with nested lists):
            if type(discipline_value) is list:
                disciplines.extend(discipline_value)
            else:
                disciplines.append(discipline_value)
        if len(disciplines) >= 1:
            vs.add_value('discipline', disciplines)

    educational_context_raw = response.xpath(
        '//div[@class="b-cpsuiu-show-targets"]/ul/li/a/text()').getall()
    if len(educational_context_raw) >= 1:
        # the educationalContext-mapping is only done when there's at least one educational_context found
        educational_context = list()
        for educational_context_value in educational_context_raw:
            # self.debug_educational_context_values.add(educational_context_value)
            if educational_context_value in self.EDUCATIONAL_CONTEXT_MAPPING.keys():
                educational_context_value = self.EDUCATIONAL_CONTEXT_MAPPING.get(
                    educational_context_value)
            if type(educational_context_value) is list:
                educational_context.extend(educational_context_value)
            else:
                educational_context.append(educational_context_value)
        if len(educational_context) >= 1:
            vs.add_value('educationalContext', educational_context)
    base.add_value('valuespaces', vs.load_item())

    lic = LicenseItemLoader()
    license_url: str = response.xpath('//div[@class="cc-licence-info"]/p/a[@rel="license"]/@href').get()
    if license_url is not None:
        if license_url.startswith("http://"):
            # the license-mapper expects urls in https:// format, but UIU uses http:// links to
            # CC-licenses
            license_url = license_url.replace("http://", "https://")
        lic.add_value('url', license_url)
    license_description_raw: str = response.xpath('//div[@class="cc-licence-info"]').get()
    if license_description_raw is not None:
        license_description_raw = w3lib.html.remove_tags(license_description_raw)
        license_description_raw = w3lib.html.replace_escape_chars(license_description_raw,
                                                                  which_ones="\n", replace_by=" ")
        # if we called replace_escape_chars() straight away, words that don't belong together would get
        # stuck together. Just replacing \n with a whitespace is enough to keep the structure of the
        # string intact.
        license_description_raw = w3lib.html.replace_escape_chars(license_description_raw)
        license_description = " ".join(license_description_raw.split())
        # making sure that there's only 1 whitespace between words
        lic.add_value('description', license_description)
    base.add_value('license', lic.load_item())

    permissions = super().getPermissions(response)
    base.add_value('permissions', permissions.load_item())
    response_loader = super().mapResponse(response)
    base.add_value('response', response_loader.load_item())
    yield base.load_item()
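# --- Usage sketch (not part of the spider): the two-step whitespace clean-up used for the
# license description above. First only "\n" is replaced by a space so words don't get glued
# together, then " ".join(str.split()) collapses any repeated whitespace. The sample input
# is invented.
import w3lib.html

raw = "Creative\nCommons\t\tLizenz\n\n(CC BY-SA)"
step1 = w3lib.html.replace_escape_chars(raw, which_ones="\n", replace_by=" ")
clean = " ".join(step1.split())
print(clean)  # -> Creative Commons Lizenz (CC BY-SA)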