def parse_site(self,
                   response: scrapy.http.HtmlResponse,
                   sitemap_entry: SitemapEntry = None):
        """
        Build one item from a page's HtmlResponse and its sitemap entry and yield it.

        Title, description, keywords and thumbnail are read from the HTML via CSS/XPath selectors;
        'lastModified' and the technical location are taken from the sitemap entry.

        :param response: the current scrapy.http.HtmlResponse (needed for xpath and css selectors)
        :param sitemap_entry: a copy of the original sitemap entry for this item. When it is None,
            no 'lastModified' value is set and response.url is used as the technical location.
        :return: yields the item produced by base.load_item()
        """
        response.meta['sitemap_entry'] = sitemap_entry
        base = super().getBase(response=response)
        base.add_value("response", super().mapResponse(response).load_item())
        # we assume that content is imported. Please use replace_value if you import something different
        base.add_value("type", Constants.TYPE_MATERIAL)
        base.add_value('thumbnail',
                       response.css('.post-thumbnail img::attr(src)').get())
        # sitemap_entry defaults to None, so its attributes must not be accessed unconditionally
        if sitemap_entry is not None:
            base.add_value('lastModified', sitemap_entry.lastmod)

        lom = LomBaseItemloader()
        general = LomGeneralItemloader(response=response)
        # the CSS Selector for 7 items was sometimes empty, which caused the pipeline to drop the whole item,
        # therefore the plain text of the "entry-title" element is used as a fallback
        title = response.css('.entry-title span::text').get()
        if title is None:
            title = response.xpath('//*[@class="entry-title"]/text()').get()
        general.add_value('title', title)

        content = response.css('.entry-content')
        # remove the sharedaddy-buttons before parsing the description text
        content.css('.sharedaddy').remove()
        # TODO: attach pdf links (if available) to new items.py ItemLoader for deep-links (not yet implemented)
        # pdf_links = content.css('ul li a::attr(href)').getall()
        description_temp = content.xpath(
            '//*[@class="entry-content"]//descendant::*/text()').getall()
        raw_description = w3lib.html.remove_tags(''.join(description_temp))
        general.add_value('description', raw_description)

        general.add_value('keyword',
                          response.css('.post-categories a::text').getall())
        lom.add_value("general", general.load_item())

        technical = LomTechnicalItemLoader()
        technical.add_value('format', 'text/html')
        if sitemap_entry is not None:
            technical.add_value('location', sitemap_entry.loc)
        else:
            # without a sitemap entry the URL of the parsed page is the only known location
            technical.add_value('location', response.url)
        lom.add_value("technical", technical.load_item())

        edu = LomEducationalItemLoader()
        lom.add_value("educational", edu.load_item())

        base.add_value("lom", lom.load_item())

        vs = ValuespaceItemLoader()
        vs.add_value('intendedEndUserRole', 'teacher')
        vs.add_value('discipline', 'Allgemein')
        vs.add_value('educationalContext', 'Elementarbereich')
        vs.add_value('learningResourceType', 'other_asset_type')
        base.add_value("valuespaces", vs.load_item())

        lic = LicenseItemLoader()
        lic.add_value('url', Constants.LICENSE_CC_ZERO_10)
        base.add_value("license", lic.load_item())

        permissions = super().getPermissions(response)
        base.add_value("permissions", permissions.load_item())

        # NOTE(review): this adds a second 'response' value on top of the mapResponse() one added
        # at the top of this method - confirm that both are intended
        response_loader = ResponseItemLoader()
        response_loader.add_value('url', response.url)
        base.add_value("response", response_loader.load_item())

        yield base.load_item()
    def parse(self, response: scrapy.http.Response, **kwargs):
        """
        Parse one map index page and yield a single item that describes the maps listed on it.

        The description is concatenated from the text of every table of the class "smalltable";
        the thumbnail is the image of the first of those tables. Values that cannot be found on
        the page (table cells, 'last modified' meta tag, keywords) are left out instead of raising.

        Scrapy Contracts:
        @url http://ginkgomaps.com/landkarten_deutschland.html
        @returns items 1
        """
        # making sure that the current url is marked as parsed:
        self.debug_parsed_urls.add(response.url)

        # IMPORTANT: modern browsers add "tbody"-elements into tables, scrapy doesn't see those tags!
        #   Remember: whatever request you see with the developer tools in your browser, you need to manually remove
        #   ANY <tbody>-tag that sits inside your xpath expression, otherwise it will return an empty [] !
        #       response.xpath('/html/body/center/table[1]/tr[4]/td[3]/table[1]').get()

        # first index page contains 42 maps, all inside tables of the class "smalltable":
        table_body = response.xpath('//table[@class="smalltable"]')
        description_parts = []
        for table_item in table_body:
            map_fields = (
                table_item.xpath('tr/td[1]/a[2]/text()').get(),  # map title
                "\n",
                table_item.xpath('tr/td[2]/u[1]/text()').get(),  # map design heading
                table_item.xpath('tr/td[2]/p[1]/text()').get(),  # map design
                table_item.xpath('tr/td[2]/u[2]/text()').get(),  # map content heading
                table_item.xpath('tr/td[2]/p[2]/text()').get(),  # map content
            )
            # .get() returns None for a missing cell, which must not be concatenated to a string
            description_parts.extend(field for field in map_fields if field is not None)

        # while we could theoretically grab all thumbnails during the above loop,
        # the first one is enough for a preview-image in edu-sharing
        first_thumbnail = None
        if table_body:
            # xpath() returns an (possibly empty) list and never None, so emptiness has to be checked
            thumbnail_src = table_body[0].xpath('tr/td[1]/a[1]/img/@src').get()
            if thumbnail_src is not None:
                # urljoin() with a missing src would return the url of the page itself
                first_thumbnail = response.urljoin(thumbnail_src)

        description_temp = w3lib.html.strip_html5_whitespace(''.join(description_parts))

        base = super().getBase(response=response)
        base.add_value('sourceId', response.url)

        last_modified = response.xpath('/html/head/meta[6]/@content').get()
        if last_modified is not None:
            base.add_value('hash', last_modified + self.version)
        base.add_value('type', Constants.TYPE_MATERIAL)
        if first_thumbnail:
            base.add_value('thumbnail', first_thumbnail)
        if last_modified is not None:
            base.add_value('lastModified', last_modified)

        lom = LomBaseItemloader()
        general = LomGeneralItemloader(response=response)
        general.add_value('language', 'de')
        general.add_value('identifier', response.url)
        # the description could be extended with additional infos about the map-formats and their resolutions,
        # (if necessary)
        general.add_value('description', description_temp)
        general.add_value('title',
                          response.xpath('/html/head/title/text()').get())
        # keywords are stored inside a String, separated by commas with (sometimes multiple) whitespaces,
        # therefore RegEx is needed to provide a list with individual keywords since a String.split() isn't enough:
        keyword_string = response.xpath(
            '/html/head/meta[@name="keywords"]/@content').get()
        if keyword_string is not None:
            general.add_value('keyword', re.split(r'\s*,\s*', keyword_string))
        lom.add_value('general', general.load_item())

        technical = LomTechnicalItemLoader()
        technical.add_value('format', 'text/html')
        technical.add_value('location', response.url)
        lom.add_value('technical', technical.load_item())

        lifecycle = LomLifecycleItemloader()
        if last_modified is not None:
            lifecycle.add_value('date', last_modified)
        lifecycle.add_value('role', 'author')
        lifecycle.add_value('firstName', 'Dirk')
        lifecycle.add_value('lastName', 'Benkert')
        lifecycle.add_value('organization', 'Ginkgomaps')
        lifecycle.add_value('url', 'https://dirkbenkert.com/')
        lom.add_value('lifecycle', lifecycle.load_item())

        educational = LomEducationalItemLoader()
        # since the learning objects are maps, expositive seems to be the best fit for interactivityType:
        educational.add_value('interactivityType', 'expositive')
        lom.add_value('educational', educational.load_item())
        base.add_value('lom', lom.load_item())

        vs = ValuespaceItemLoader()
        # since no educationalContext is given, either hardcode these values or don't use them at all
        # vs.add_value('educationalContext', ["Sekundarstufe I",
        #                                     "Sekundarstufe II",
        #                                     "Berufliche Bildung",
        #                                     "Erwachsenenbildung"])
        vs.add_value('intendedEndUserRole', ["learner", "teacher", "parent"])
        vs.add_value('discipline', 'Geografie')  # Geografie
        vs.add_value('learningResourceType', 'map')  # Karte
        vs.add_value('conditionsOfAccess', 'no login')

        lic = LicenseItemLoader()
        # if needed, the license description could also be gathered and constructed from multiple tags within a
        # container: /html/body/center/table[1]/tbody/tr[5]/td[2]/p
        license_url: str = response.xpath(
            '/html/body/center/table[1]/tr[5]/td[2]/p/a/@href').get()
        if (license_url is not None) and (license_url.endswith("deed.de")):
            # cut off the trailing "deed.de" and switch the url to https
            license_url = license_url[:-len("deed.de")]
            license_url = license_url.replace("http://", "https://")
            lic.add_value('url', license_url)
        lic.add_value('author',
                      response.xpath('/html/head/meta[3]/@content').get())

        base.add_value('valuespaces', vs.load_item())
        base.add_value('license', lic.load_item())

        permissions = super().getPermissions(response)
        base.add_value('permissions', permissions.load_item())

        base.add_value('response', super().mapResponse(response).load_item())

        yield base.load_item()
    def parse(self, response: scrapy.http.Response, **kwargs):
        """
        Build one item from a single .htm page and yield it.

        Description, title and keywords are read from the page with several fallbacks, because the
        pages store them in different elements. A page without a usable title no longer raises:
        the title is simply not set in that case.

        :param response: the scrapy.http.Response of the page that is currently parsed
        :param kwargs: not used by this method
        :return: yields the item produced by base.load_item()
        """
        base = super().getBase(response=response)
        # there are no suitable images to serve as thumbnails, therefore SPLASH will have to do
        base.add_value('type', Constants.TYPE_MATERIAL)

        lom = LomBaseItemloader()
        general = LomGeneralItemloader(response=response)
        description_raw = ''.join(
            response.xpath('//descendant::td[@class="t1fbs"]').getall())
        description_raw = w3lib.html.remove_tags(description_raw)
        description_raw = w3lib.html.strip_html5_whitespace(description_raw)
        general.add_value('description',
                          w3lib.html.replace_escape_chars(description_raw))
        if not description_raw:
            # Fallback for exercise-pages where there's only 1 title field and 1 short instruction sentence
            # e.g.: http://www.zum.de/dwu/depothp/hp-phys/hppme24.htm
            description_fallback = response.xpath(
                '//descendant::div[@id="InstructionsDiv"]/descendant'
                '::*/text()').get()
            general.replace_value('description', description_fallback)

        # most of the time the title is stored directly
        title = response.xpath('/html/head/title/text()').get()
        if title is not None and title.startswith("Dieses Info-Fenster"):
            # some subpages carry "Dieses Info-Fenster bleibt bis zum Schließen im Vordergrund" as their title,
            # therefore we need to grab the title from a better suited element.
            # This also means that the "description" is most probably wrong and needs a replacement as well:
            title = response.xpath('//td[@class="tt1math"]/text()').get()
            if title is not None:
                title = title.strip()
            desc_list = response.xpath('//td[@class="t1fbs"]/text()').getall()
            if not desc_list:
                # if the first attempt at grabbing a description fails, we try it at another place
                # (.get() returns a single string or None instead of a list)
                desc_list = response.xpath('//td[@class="sg12"]/text()').get()
            if desc_list is not None:
                description_raw = ''.join(desc_list)
                # if there's multiple whitespaces within the description, replace them by a single whitespace:
                description_raw = re.sub(' +', ' ', description_raw)
                general.replace_value(
                    'description',
                    w3lib.html.replace_escape_chars(description_raw))

        if title is not None:
            title = w3lib.html.replace_escape_chars(title)
            if title == '':
                # there's some pages (Exercises) that only hold escape chars as their title,
                # the title is simply bold text hidden within a div container
                title = response.xpath(
                    '//div[@class="Titles"]/h3[@class="ExerciseSubtitle"]/b/text()'
                ).get()
        if title is not None:
            # checked again, because the fallback lookup above may not have found anything
            title = title.strip()
            # Since we're grabbing titles from headings, a lot of them have a trailing ":"
            if title.endswith(":"):
                title = title[:-1]
            general.add_value('title', title)

        general.add_value('identifier', response.url)
        general.add_value('language', 'de')
        # on the vast majority of .htm pages the keywords sit in the http-equiv content tag
        keyword_string = response.xpath(
            '/html/head/meta[@http-equiv="keywords"]/@content').get()
        if keyword_string is None:
            # but on some sub-pages, especially the interactive javascript pages, the keywords are in another container
            keyword_string = response.xpath(
                '/html/head/meta[@name="keywords"]/@content').get()
        if keyword_string is not None:
            keyword_list = keyword_string.split(", ")
            # trying to catch the completely broken keyword strings to clean them up manually
            # e.g. at http://www.zum.de/dwu/depothp/hp-math/hpmz21.htm check XPath: /html/head/meta[2]
            kw_set = set()
            if keyword_list[0].endswith(","):
                # broken keyword list detected, now we have to manually clean the string up
                broken_keyword_string = response.xpath(
                    '//meta[@name="keywords"]').get()
                if broken_keyword_string is not None:
                    broken_keyword_list = broken_keyword_string.replace('<meta name="keywords" content=', "") \
                        .replace(">", "").replace('"', "").replace(",", "").replace("=", "").split(" ")
                    kw_set.update(item.strip() for item in broken_keyword_list)
            if not kw_set:
                # if there was no broken keyword meta field found, this condition always triggers
                kw_set = set(keyword_list)
            # throw away the keywords that appear on the set of unwanted keywords and only keep the valid ones
            kw_set.difference_update(self.keywords_to_ignore)
            general.add_value('keyword', list(kw_set))
        lom.add_value('general', general.load_item())

        technical = LomTechnicalItemLoader()
        technical.add_value('format', 'text/html')
        technical.add_value('location', response.url)
        lom.add_value('technical', technical.load_item())

        lifecycle = LomLifecycleItemloader()
        lifecycle.add_value('role', 'author')
        lifecycle.add_value('firstName', 'Dieter')
        lifecycle.add_value('lastName', 'Welz')
        lifecycle.add_value('url', '*****@*****.**')
        lifecycle.add_value(
            'organization',
            response.xpath(
                '/html/head/meta[@http-equiv="organization"]/@content').get())
        lom.add_value('lifecycle', lifecycle.load_item())

        educational = LomEducationalItemLoader()
        lom.add_value('educational', educational.load_item())

        base.add_value('lom', lom.load_item())

        vs = ValuespaceItemLoader()
        # since the website holds both mathematics- and physics-related materials, we need to take a look at the last
        # section of the url: .htm filenames that start with
        #   m | hpm | tkm           belong to the discipline mathematics
        #   p | kwp | hpp | vcp     belong to the discipline physics
        url_last_part = response.url.split('/')[-1]
        if url_last_part.startswith(("m", "hpm", "tkm")):
            vs.add_value('discipline', 'Mathematics')
        if url_last_part.startswith(("p", "kwp", "hpp", "vcp")):
            vs.add_value('discipline', "Physics")
        vs.add_value('learningResourceType', Constants.TYPE_MATERIAL)
        vs.add_value('intendedEndUserRole', [
            'learner',
            'teacher',
            'parent',
        ])
        vs.add_value('price', 'no')
        vs.add_value('conditionsOfAccess', 'no login')

        lic = LicenseItemLoader()
        lic.add_value('description', 'http://www.zum.de/dwu/hilfe.htm')
        lic.add_value('internal', Constants.LICENSE_CUSTOM)
        lic.add_value(
            'author',
            response.xpath(
                '/html/head/meta[@http-equiv="author"]/@content').get())

        base.add_value('valuespaces', vs.load_item())
        base.add_value('license', lic.load_item())

        permissions = super().getPermissions(response)
        base.add_value('permissions', permissions.load_item())

        base.add_value('response', super().mapResponse(response).load_item())

        yield base.load_item()
Exemple #4
0
    def parse(self, response: scrapy.http.Response, **kwargs) -> BaseItemLoader:
        """
        Build one item from the JSON-LD block (element with the id "ld") of the rendered page.

        The page is fetched a second time through WebTools.getUrlData, because the metadata is read
        from the HTML returned by that call and not from the scrapy response itself.

        Scrapy Contracts:
        @url https://kmap.eu/app/browser/Mathematik/Exponentialfunktionen/Asymptoten
        @returns item 1

        :param response: the scrapy.http.Response of the page that is currently parsed
        :param kwargs: "lastModified" is read from here and stored as the item's 'lastModified' value
        :return: the item produced by base.load_item(), or None if the page holds no JSON-LD "mainEntity"
        """
        import logging

        last_modified = kwargs.get("lastModified")
        url_data_splash_dict = WebTools.getUrlData(response.url, engine=WebEngine.Pyppeteer)
        splash_html_string = url_data_splash_dict.get('html')
        json_ld_string = Selector(text=splash_html_string).xpath('//*[@id="ld"]/text()').get()
        # every metadata field below is read from "mainEntity", so there is nothing to build without it
        main_entity = None
        if json_ld_string is not None:
            main_entity = json.loads(json_ld_string).get("mainEntity")
        if main_entity is None:
            logging.warning("No JSON-LD 'mainEntity' found at %s - skipping item", response.url)
            return None
        # TODO: skip item method - (skips item if it's an empty knowledge map)

        date_published = main_entity.get("datePublished")

        base = BaseItemLoader()
        base.add_value('sourceId', response.url)
        if date_published is not None:
            base.add_value('hash', date_published + self.version)
        base.add_value('lastModified', last_modified)
        base.add_value('type', Constants.TYPE_MATERIAL)
        # Thumbnails have their own url path, which can be found in the json+ld:
        #   "thumbnailUrl": "/snappy/Physik/Grundlagen/Potenzschreibweise"
        # e.g. for the item https://kmap.eu/app/browser/Physik/Grundlagen/Potenzschreibweise
        # the thumbnail can be found at https://kmap.eu/snappy/Physik/Grundlagen/Potenzschreibweise
        thumbnail_path = main_entity.get("thumbnailUrl")
        if thumbnail_path is not None:
            base.add_value('thumbnail', 'https://kmap.eu' + thumbnail_path)

        lom = LomBaseItemloader()
        general = LomGeneralItemloader()
        general.add_value('identifier', main_entity.get("mainEntityOfPage"))
        keywords_string = main_entity.get("keywords")
        if keywords_string is not None:
            general.add_value('keyword', keywords_string.split(", "))
        general.add_value('title', main_entity.get("name"))
        general.add_value('description', main_entity.get("description"))
        general.add_value('language', main_entity.get("inLanguage"))
        lom.add_value('general', general.load_item())

        technical = LomTechnicalItemLoader()
        technical.add_value('format', 'text/html')
        technical.add_value('location', response.url)
        lom.add_value('technical', technical.load_item())

        # a missing "publisher" object is treated like an empty one
        publisher = main_entity.get("publisher") or {}
        lifecycle = LomLifecycleItemloader()
        lifecycle.add_value('role', 'publisher')
        lifecycle.add_value('organization', publisher.get("name"))
        author_email = publisher.get("email")
        if author_email is not None:
            lifecycle.add_value('email', author_email)
        lifecycle.add_value('url', 'https://kmap.eu/')
        lifecycle.add_value('date', date_published)
        lom.add_value('lifecycle', lifecycle.load_item())

        educational = LomEducationalItemLoader()
        lom.add_value('educational', educational.load_item())
        base.add_value('lom', lom.load_item())

        vs = ValuespaceItemLoader()
        vs.add_value('discipline', main_entity.get("about"))
        vs.add_value('intendedEndUserRole', main_entity.get("audience"))
        vs.add_value('learningResourceType', main_entity.get("learningResourceType"))
        vs.add_value('price', 'no')
        vs.add_value('conditionsOfAccess', 'login required for additional features')
        base.add_value('valuespaces', vs.load_item())

        # a missing "author" object is treated like an empty one
        author = main_entity.get("author") or {}
        lic = LicenseItemLoader()
        lic.add_value('author', author.get("name"))
        lic.add_value('url', main_entity.get("license"))
        base.add_value('license', lic.load_item())

        permissions = super().getPermissions(response)
        base.add_value("permissions", permissions.load_item())

        base.add_value('response', super().mapResponse(response).load_item())

        return base.load_item()
    def parse(self, response: scrapy.http.Response, **kwargs):
        """
        Populates a BaseItemLoader with metadata and yields the BaseItem afterwards.

        The publication date, the 'last modified' date and the license url are read from the
        <p class="Ende">-element of the HTML returned by WebTools.getUrlData. If that element is
        missing, the dates are left out instead of raising.

        Scrapy Contracts:
        @url https://www.walter-fendt.de/html5/mde/pythagoras2_de.htm
        @returns items 1
        """
        # fetching publication date and lastModified from dynamically loaded <p class="Ende">-element:
        url_data_splash_dict = WebTools.getUrlData(response.url,
                                                   engine=WebEngine.Pyppeteer)
        splash_html_string = url_data_splash_dict.get('html')
        rendered_page = Selector(text=splash_html_string)
        page_end_element = rendered_page.xpath('//p[@class="Ende"]').get()
        # .get() returns None if the element wasn't found, in that case there is nothing to split
        page_end_string = page_end_element.split('<br>') if page_end_element is not None else []
        published_date = None
        last_modified = None
        # the two strings inside the <p>-Container will look like this:
        # Walter Fendt, 2. November 2000
        # Letzte Änderung: 17. Oktober 2017
        # therefore we'll need to extract the dates by splitting up the strings
        for temp_string in page_end_string:
            if temp_string.startswith("Walter Fendt"):
                # the last part that can be parsed as a date wins
                for item in temp_string.split(', '):
                    parsed_date = dateparser.parse(item)
                    if parsed_date is not None:
                        published_date = parsed_date
            if temp_string.startswith('Letzte Änderung:'):
                for item2 in temp_string.split(': '):
                    parsed_date = dateparser.parse(item2)
                    if parsed_date is not None:
                        last_modified = parsed_date

        base = super().getBase(response=response)
        base.add_value('type', Constants.TYPE_MATERIAL)
        if last_modified is not None:
            hash_temp = last_modified.isoformat() + self.version
            base.add_value('hash', hash_temp)
            base.add_value('lastModified', last_modified.isoformat())
        base.add_value('sourceId', response.url)

        lom = LomBaseItemloader()
        general = LomGeneralItemloader(response=response)
        general.add_value('identifier', response.url)
        general.add_value('title',
                          response.xpath('/html/head/title/text()').get())
        general.add_value(
            'description',
            response.xpath(
                '/html/head/meta[@name="description"]/@content').get())
        keywords_string = response.xpath(
            '/html/head/meta[@name="keywords"]/@content').get()
        if keywords_string is not None:
            general.add_value('keyword', keywords_string.split(", "))
        general.add_value('language', 'de')
        lom.add_value('general', general.load_item())

        technical = LomTechnicalItemLoader()
        technical.add_value('format', "text/html")
        technical.add_value('location', response.url)
        lom.add_value('technical', technical.load_item())

        lifecycle = LomLifecycleItemloader()
        lifecycle.add_value('role', 'author')
        lifecycle.add_value('firstName', 'Walter')
        lifecycle.add_value('lastName', 'Fendt')
        lifecycle.add_value(
            'url', "https://www.walter-fendt.de/wf.htm")  # author information
        if published_date is not None:
            lifecycle.add_value('date', published_date.isoformat())
        lom.add_value('lifecycle', lifecycle.load_item())

        educational = LomEducationalItemLoader()
        educational.add_value('interactivityType', 'mixed')
        lom.add_value('educational', educational.load_item())

        base.add_value('lom', lom.load_item())

        vs = ValuespaceItemLoader()
        vs.add_value('conditionsOfAccess', 'no login')
        vs.add_value('discipline', 'Mathematik')
        vs.add_value('intendedEndUserRole', ['learner', 'teacher', 'parent'])
        vs.add_value('learningResourceType', ['application', 'web page'])
        vs.add_value('price', 'no')
        base.add_value('valuespaces', vs.load_item())

        lic = LicenseItemLoader()
        lic.add_value('author', 'Walter Fendt')
        # if scrapy could render the <p class="Ende">-element, the license url could be found with the following XPath:
        # license_url = response.xpath('//p[@class="Ende"]/a[@rel="license"]/@href')
        # but since scrapy can't "see" this container, the HTML fetched by WebTools.getUrlData is used instead
        license_url = rendered_page.xpath(
            '//p[@class="Ende"]/a[@rel="license"]/@href').get()
        if license_url is not None:
            if license_url.startswith("http://"):
                license_url = license_url.replace("http://", "https://")
            # the license url links to the /de/ version, which currently doesn't get mapped properly
            # "https://creativecommons.org/licenses/by-nc-sa/3.0/de/"
            # -> 'https://creativecommons.org/licenses/by-nc-sa/3.0/' is the url-format we want
            if "creativecommons.org/licenses/" in license_url and license_url.endswith(
                    "/de/"):
                # only cut off the trailing "de/", an earlier "de/" inside the url must stay untouched
                license_url = license_url[:-len("de/")]
            lic.add_value('url', license_url)
        base.add_value('license', lic.load_item())

        permissions = super().getPermissions(response)
        base.add_value('permissions', permissions.load_item())

        # TODO: fix super().mapResponse
        base.add_value('response', super().mapResponse(response).load_item())

        yield base.load_item()
Exemple #6
0
    def get_metadata_from_review_url(self, response: scrapy.http.Response, **kwargs):
        """
        Grabs metadata from the "material_review_url"-page and combines it with the wp_json_item from the
        "parse_page"-method into one item.

        Optional parts of either source (a missing ld+json script, missing or empty taxonomy lists, a missing
        description, age-range names that don't look like "<from>-<to>") are skipped instead of aborting the
        whole item with an exception.

        :param response: the scrapy.http.Response object for the currently parsed page
        :param kwargs: must hold the wp_json_item-dictionary under the key "item"
        :return: yields the item loaded from a BaseItemLoader
        """
        wp_json_item = kwargs.get("item")

        # the ld+json script is treated as optional: without it there simply is no dateModified / language /
        # publication date / organization metadata (and therefore no hash) for this item
        ld_graph_items = []
        ld_json_string = response.xpath('/html/head/script[@type="application/ld+json"]/text()').get()
        if ld_json_string is not None:
            ld_json = json.loads(html.unescape(ld_json_string.strip()))
            ld_graph_items = ld_json.get("@graph") or []

        hash_temp: Optional[str] = None
        language_temp: Optional[str] = None
        pub_date: Optional[str] = None
        organization_id: Optional[str] = None
        organization_name: Optional[str] = None
        date_modified: Optional[str] = None
        # this is a workaround to make sure that we actually grab the following data,
        # no matter where they are positioned in the list:
        #   - dateModified
        #   - inLanguage
        #   - datePublished
        #   - organization_name and url
        # e.g.: since the number of elements inside the "@graph"-Array fluctuates, we can't rely on a fixed
        # position for "dateModified" (sometimes it's [2], sometimes [3] etc.), therefore we check all of them
        for graph_item in ld_graph_items:
            graph_date_modified = graph_item.get("dateModified")
            if graph_date_modified is not None:
                date_modified = graph_date_modified  # used instead of the wp_json 'date' for lastModified
                hash_temp = graph_date_modified + self.version
            graph_type = graph_item.get("@type")
            if graph_type == "WebSite":
                language_temp = graph_item.get("inLanguage")
            elif graph_type == "WebPage":
                pub_date = graph_item.get("datePublished")
            elif graph_type == "Organization":
                organization_id = graph_item.get("@id")
                organization_name = graph_item.get("name")

        base = BaseItemLoader()
        base.add_value("sourceId", response.url)
        base.add_value("hash", hash_temp)

        # base.add_value("response", super().mapResponse(response).load_item())

        base.add_value("type", Constants.TYPE_MATERIAL)  # TODO: is this correct? use mapping for edu-context?
        base.add_value("thumbnail", wp_json_item.get("material_screenshot"))
        # base.add_value("lastModified", wp_json_item.get("date"))  # is "date" from wp_json for lastModified correct?
        base.add_value("lastModified", date_modified)  # or is this one better (grabbed from material_review_url)?

        lom = LomBaseItemloader()
        general = LomGeneralItemloader(response=response)
        general.add_value("title", wp_json_item.get("material_titel"))

        # the source material heavily fluctuates between perfectly fine strings and messy (hardcoded) html tags
        # as well as "\n" and "\t", therefore we need to clean up that String first.
        # (the w3lib helpers are only called if there actually is a description to clean up)
        raw_description = wp_json_item.get("material_beschreibung")
        if raw_description is not None:
            raw_description = w3lib.html.remove_tags(raw_description)
            raw_description = w3lib.html.strip_html5_whitespace(raw_description)
            clean_description = w3lib.html.replace_escape_chars(raw_description)
            general.add_value("description", clean_description)

        general.add_value("identifier", wp_json_item.get("id"))
        if language_temp is not None:
            general.add_value("language", language_temp)

        # "or []": a missing / empty taxonomy must not break the iteration
        kw_temp = [keyword.get("name") for keyword in wp_json_item.get("material_schlagworte") or []]
        general.add_value("keyword", kw_temp)
        lom.add_value("general", general.load_item())

        technical = LomTechnicalItemLoader()

        technical.add_value("format", "text/html")
        technical.add_value("location", wp_json_item.get("material_review_url"))
        lom.add_value("technical", technical.load_item())

        lifecycle = LomLifecycleItemloader()
        if organization_name is not None:
            lifecycle.add_value("organization", organization_name)
        if organization_id is not None:
            lifecycle.add_value("url", organization_id)
        if pub_date is not None:
            lifecycle.add_value("date", pub_date)

        lom.add_value("lifecycle", lifecycle.load_item())

        educational = LomEducationalItemLoader()

        age_groups = wp_json_item.get("material_altersstufe")
        if age_groups:
            # age range is returned as a list of <from_age>-<to_age>-Strings, possible return values are:
            # e.g. "01-05", "05-10", "10-13", "13-15", "15-19" and "18-99"
            age_regex = re.compile(r'(\d{1,2})-(\d{1,2})')
            age_range = set()
            for age_group in age_groups:
                age_match = age_regex.search(age_group.get("name") or "")
                if age_match is None:
                    # a name that doesn't follow the <from>-<to> pattern carries no usable age information
                    continue
                age_range.update(age_match.groups())
            if age_range:
                # the values stay strings (as they were delivered), but they are compared numerically:
                # a plain string comparison would rank "5" above "10"
                age_range_item_loader = LomAgeRangeItemLoader()
                age_range_item_loader.add_value("fromRange", min(age_range, key=int))
                age_range_item_loader.add_value("toRange", max(age_range, key=int))
                educational.add_value("typicalAgeRange", age_range_item_loader.load_item())

        lom.add_value("educational", educational.load_item())
        base.add_value("lom", lom.load_item())

        vs = ValuespaceItemLoader()
        vs.add_value("discipline", "http://w3id.org/openeduhub/vocabs/discipline/520")  # Religion
        # mapping educationalContext: known names are translated, unknown names are passed through unchanged
        for edu_con_item in wp_json_item.get("material_bildungsstufe") or []:
            edu_item = edu_con_item.get("name")
            edu_item = self.mapping_edu_context.get(edu_item, edu_item)
            if edu_item != "":
                vs.add_value("educationalContext", edu_item)

        # using the mapped media types for valuespaces -> learningResourceType
        for media_type in wp_json_item.get("material_medientyp") or []:
            media_type_item = media_type.get("name")
            media_type_item = self.mapping_media_types.get(media_type_item, media_type_item)
            if media_type_item != "":
                vs.add_value("learningResourceType", media_type_item)
        # see: https://vocabs.openeduhub.de/w3id.org/openeduhub/vocabs/learningResourceType/index.html

        # there's metadata for "Kompetenzen" (e.g.: "Deuten", "Gestalten", "Reflexion") within the returned wp_json
        # that our data-model doesn't support yet. for future reference though:
        #   wp_json_item.get("material_kompetenzen") -> list

        vs.add_value("intendedEndUserRole", "teacher")

        lic = LicenseItemLoader()

        # the license phrases are plain literals, so substring tests are sufficient
        phrase_nc_reuse = 'Zur nicht kommerziellen Wiederverwendung gekennzeichnet'
        phrase_nc_reuse_and_change = 'Zur nicht kommerziellen Wiederverwendung und Veränderung gekennzeichnet'
        # important clarification from rpi-virtuell:
        #   'frei zugänglich' describes 'ungeklärte Lizenz' / 'volles Urheberrecht'
        #   CC licenses > 'frei zugänglich' if both values are found in the license description
        phrase_free_access = 'frei zugänglich'
        phrase_free_after_signup = 'kostenfrei nach Anmeldung'
        phrase_with_costs = 'kostenpflichtig'

        license_description = response.xpath('//div[@class="material-detail-meta-access material-meta"]'
                                             '/div[@class="material-meta-content-entry"]/text()').get()

        if license_description is not None:
            license_description = html.unescape(license_description.strip())
            lic.add_value("description", license_description)

            cc_license_found = False
            if phrase_nc_reuse in license_description:
                lic.add_value("url", Constants.LICENSE_CC_BY_NC_ND_40)
                cc_license_found = True
            if phrase_nc_reuse_and_change in license_description:
                lic.add_value("url", Constants.LICENSE_CC_BY_NC_SA_30)
                cc_license_found = True
            # if a material is "frei zugänglich", set price to none, but don't override a previously set CC-license
            if phrase_free_access in license_description:
                vs.add_value("price", "no")
                # only descriptions that start with "frei zugänglich" and carry no explicit CC-license
                # fall back to the default license:
                # see https://rpi-virtuell.de/nutzungsbedingungen/ (5.)
                if not cc_license_found and license_description.startswith(phrase_free_access):
                    lic.add_value("url", Constants.LICENSE_CC_BY_SA_40)
            if phrase_with_costs in license_description:
                lic.add_value("internal", Constants.LICENSE_COPYRIGHT_LAW)
                vs.add_value("price", "yes")
            if phrase_free_after_signup in license_description:
                vs.add_value("price", "yes")
                vs.add_value("conditionsOfAccess", "login")
        else:
            # by default, all materials should be CC_BY_SA - according to the rpi-virtuell ToS
            lic.replace_value("url", Constants.LICENSE_CC_BY_SA_40)

        # the author should end up in LOM lifecycle, but the returned metadata are too messily formatted to parse them
        # by easy patterns like (first name) + (last name); blank names are dropped
        authors = [
            author.get("name")
            for author in wp_json_item.get("material_autoren") or []
            if (author.get("name") or "").strip()
        ]
        lic.add_value("author", authors)

        base.add_value("valuespaces", vs.load_item())

        base.add_value("license", lic.load_item())

        permissions = super().getPermissions(response)
        base.add_value("permissions", permissions.load_item())

        response_loader = ResponseItemLoader()
        response_loader.add_value("url", response.url)
        base.add_value("response", response_loader.load_item())

        yield base.load_item()
Exemple #7
0
    def parse(self, response: scrapy.http.Response,
              **kwargs) -> BaseItemLoader:
        """
        Template for turning one crawled page into an item.

        Every ItemLoader a crawler has to deal with is created here, filled with placeholder values and loaded
        into the BaseItemLoader. The "TODO"-lists name the keys each loader accepts; ALL possible keys of the
        different Item- and ItemLoader-classes can be found inside converter/items.py.

        :param response: the scrapy.http.Response of the page that is being parsed
        :param kwargs: not used by this template
        :return: yields the item that was loaded from the BaseItemLoader
        """
        item_loader = BaseItemLoader()

        # TODO: fill the "base"-keys with values for
        #  - sourceId           required    (see: getId()-method above)
        #  - hash               required    (see: getHash()-method above)
        #  - lom                required    (see: LomBaseItemLoader below)
        #  - valuespaces        required    (see: ValueSpacesItemLoader below)
        #  - permissions        required    (see: PermissionItemLoader below)
        #  - license            required    (see: LicenseItemLoader below)
        #  - lastModified       recommended
        #  - type               recommended
        #  - thumbnail          recommended
        #  - publisher          optional
        item_loader.add_value('sourceId', response.url)
        # a source without a "datePublished"- or "lastModified"-value (in its header or JSON_LD) needs a
        # substitute: a unique string built from the datetime of the crawl + self.version is a common choice
        item_hash: str = "This string should consist of a date (publication date, preferably)" + self.version
        item_loader.add_value('hash', item_hash)
        modification_date = None
        item_loader.add_value('lastModified', modification_date)
        # the JSON_LD sometimes supplies a "type"-value. If the crawled website doesn't provide one,
        # a constant has to do:
        item_loader.add_value('type', Constants.TYPE_MATERIAL)
        thumbnail: str = "This string should hold the thumbnail URL"
        item_loader.add_value('thumbnail', thumbnail)

        # TODO: the LomBaseItem is filled up afterwards with
        #  - LomGeneralItem                 required
        #  - LomTechnicalItem               required
        #  - LomLifeCycleItem               required (multiple possible)
        #  - LomEducationalItem             required
        #  - LomClassificationItem          optional
        lom_loader = LomBaseItemloader()

        # TODO: fill the "general"-keys with values for
        #  - identifier                     required
        #  - title                          required
        #  - keyword                        required
        #  - description                    required
        #  - language                       recommended
        #  - coverage                       optional
        #  - structure                      optional
        #  - aggregationLevel               optional
        general_loader = LomGeneralItemloader()
        # e.g.: the URL of a material might serve as its unique identifier
        general_loader.add_value('identifier', response.url)
        # TODO: don't forget to add key-value-pairs for 'title', 'keyword' and 'description'!
        # as soon as every available value has been added to its key, load_item() hands the (now filled)
        # LomGeneralItem over to the LomBaseItemLoader
        general_item = general_loader.load_item()
        lom_loader.add_value('general', general_item)

        # TODO: fill the "technical"-keys with values for
        #  - format                         required (expected: MIME-type, e.g. 'text/html' for web-sites)
        #  - location                       required (expected: URI / URL of a learning object / material)
        #  - size                           optional
        #  - requirement                    optional
        #  - installationRemarks            optional
        #  - otherPlatformRequirements      optional
        #  - duration                       optional (only applies to audiovisual content like videos/podcasts)
        technical_loader = LomTechnicalItemLoader()
        # just like in the "general"-loader, individual values are set with
        #   technical_loader.add_value('key', 'value')
        # or replaced with
        #   technical_loader.replace_value('key', 'value')
        # 'text/html': e.g. if the learning object is a web-page
        technical_loader.add_value('format', 'text/html')
        # the URL that is being navigated by the crawler, if the learning object has a unique one
        technical_loader.add_value('location', response.url)
        technical_item = technical_loader.load_item()
        lom_loader.add_value('technical', technical_item)

        # TODO: fill the "lifecycle"-keys with values for
        #  - role                           recommended
        #  - firstName                      recommended
        #  - lastName                       recommended
        #  - url                            recommended
        #  - date                           recommended
        #  - organization                   optional
        #  - email                          optional
        #  - uuid                           optional
        lifecycle_loader = LomLifecycleItemloader()
        # supported roles: "author" / "editor" / "publisher"
        # (the available roles mapping can be found in converter/es_connector.py)
        lifecycle_loader.add_value('role', 'author')
        lifecycle_item = lifecycle_loader.load_item()
        lom_loader.add_value('lifecycle', lifecycle_item)

        # TODO: fill the "educational"-keys with values for
        #  - description                    recommended (= "Comments on how this learning object is to be used")
        #  - language                       recommended
        #  - interactivityType              optional
        #  - interactivityLevel             optional
        #  - semanticDensity                optional
        #  - typicalAgeRange                optional
        #  - difficulty                     optional
        #  - typicalLearningTime            optional
        educational_loader = LomEducationalItemLoader()
        lom_loader.add_value('educational', educational_loader.load_item())

        # TODO: fill the "classification"-keys with values for
        #  - cost                           optional
        #  - purpose                        optional
        #  - taxonPath                      optional
        #  - description                    optional
        #  - keyword                        optional
        classification_loader = LomClassificationItemLoader()
        lom_loader.add_value('classification', classification_loader.load_item())

        # with "general", "technical", "lifecycle" and "educational" in place,
        # the LomBaseItem goes into the BaseItemLoader
        item_loader.add_value('lom', lom_loader.load_item())

        # possible values are documented at https://vocabs.openeduhub.de
        # and in https://github.com/openeduhub/oeh-metadata-vocabs
        # TODO: fill the "valuespaces"-keys with values for
        #  - discipline                     recommended
        #  (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/discipline.ttl)
        #  - intendedEndUserRole            recommended
        #  (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/intendedEndUserRole.ttl)
        #  - learningResourceType           recommended
        #  (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/learningResourceType.ttl)
        #  - conditionsOfAccess             recommended
        #  (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/conditionsOfAccess.ttl)
        #  - containsAdvertisement          recommended
        #  (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/containsAdvertisement.ttl)
        #  - price                          recommended
        #  (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/price.ttl)
        #  - educationalContext             optional
        #  (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/educationalContext.ttl)
        #  - sourceContentType              optional
        #  (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/sourceContentType.ttl)
        #  - toolCategory                   optional
        #  (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/toolCategory.ttl)
        #  - accessibilitySummary           optional
        #  (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/accessibilitySummary.ttl)
        #  - dataProtectionConformity       optional
        #  (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/dataProtectionConformity.ttl)
        #  - fskRating                      optional
        #  (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/fskRating.ttl)
        #  - oer                            optional
        #  (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/oer.ttl)
        valuespace_loader = ValuespaceItemLoader()
        item_loader.add_value('valuespaces', valuespace_loader.load_item())

        # TODO: fill the "license"-keys with values for
        #  - url                            required
        #  - oer                            recommended ('oer' is automatically set if the 'url'-field above
        #  is recognized in LICENSE_MAPPINGS: for possible url-mapping values, please take a look at
        #  LICENSE_MAPPINGS in converter/constants.py)
        #  - author                         recommended
        #  - internal                       optional
        #  - description                    optional
        #  - expirationDate                 optional (for content that expires, e.g. ÖR-Mediatheken)
        license_loader = LicenseItemLoader()
        item_loader.add_value('license', license_loader.load_item())

        # the PermissionItemLoader can be filled manually (not necessary most of the times) ...
        permission_loader = PermissionItemLoader()
        # ... but calling the inherited getPermissions(response)-method
        #   from converter/spiders/base_classes/lom_base.py by using super() is the preferred way:
        # permission_loader = super().getPermissions(response)
        # TODO: if necessary, add/replace values for the following "permissions"-keys
        #  - public                         optional
        #  - groups                         optional
        #  - mediacenters                   optional
        #  - autoCreateGroups               optional
        #  - autoCreateMediacenters         optional
        item_loader.add_value('permissions', permission_loader.load_item())

        # the ResponseItemLoader can be filled manually (not necessary most of the time) ...
        page_response_loader = ResponseItemLoader()
        # ... but calling the inherited mapResponse(response)-method
        #   from converter/spiders/base_classes/lom_base.py by using super() is the preferred way:
        # page_response_loader = super().mapResponse(response)
        # TODO: if necessary, add/replace values for the following "response"-keys
        #  - url                            required
        #  - status                         optional
        #  - html                           optional
        #  - text                           optional
        #  - headers                        optional
        #  - cookies                        optional
        #  - har                            optional
        item_loader.add_value('response', page_response_loader.load_item())

        # now that every scrapy.Item has been loaded into the BaseItemLoader,
        # its .load_item() method produces the BaseItem that gets yielded
        yield item_loader.load_item()
    def parse(self, response: scrapy.http.Response, **kwargs):
        """
        Parses an individual 'worksheet' and combines the metadata with data from its 'bundle'-dictionary.

        Spider Contracts:
        @url https://editor.mnweg.org/mnw/dokument/vocabulary-around-the-world-3
        @returns items 1

        :param response: the scrapy.http.Response of the worksheet page (only its publication date is read)
        :param kwargs: bundle metadata; the keys read here are 'bundle_url', 'bundle_thumbnail', 'bundle_title',
            'bundle_description', 'worksheet_description_summary', 'bundle_ld_json_organization',
            'bundle_educational_level' and 'bundle_discipline'
        :return: yields a BaseItemLoader
        """
        # local import: the fallback below must not depend on how (or if) the module imports datetime
        import datetime

        # since we're only parsing the first worksheet for some additional metadata, the metadata object will be
        # centered around a bundle, not the individual pages
        date_published = response.xpath(
            '//ul[@class="meta"]/li[3]/text()').get()

        base = BaseItemLoader()
        base.add_value("sourceId", kwargs.get('bundle_url'))
        if date_published is not None:
            hash_temp = date_published + self.version
        else:
            # the page didn't expose a publication date: instead of failing on "None + str",
            # fall back to the datetime of the crawl (+ version) as the hash
            hash_temp = datetime.datetime.now().isoformat() + self.version
        base.add_value("hash", hash_temp)
        # this is a hacky solution: the thumbnail is the miniature preview of the bundle's first worksheet
        bundle_thumbnail = kwargs.get('bundle_thumbnail')
        if bundle_thumbnail is not None:
            base.add_value('thumbnail', bundle_thumbnail)
        base.add_value('type', Constants.TYPE_MATERIAL)
        base.add_value('lastModified', date_published)

        lom = LomBaseItemloader()
        general = LomGeneralItemloader()
        general.add_value('title', kwargs.get('bundle_title'))

        # not every bundle has a description and the worksheet summary may be absent as well:
        # whatever is available gets joined (bundle description first), nothing available -> empty string
        available_descriptions = [
            description
            for description in (kwargs.get('bundle_description'),
                                kwargs.get('worksheet_description_summary'))
            if description is not None
        ]
        description_temp: str = "\n\n".join(available_descriptions)
        general.add_value('description', description_temp)
        general.add_value('language', 'de')
        general.add_value('identifier', kwargs.get('bundle_url'))
        lom.add_value('general', general.load_item())

        technical = LomTechnicalItemLoader()
        technical.add_value("format", "text/html")
        technical.add_value('location', kwargs.get('bundle_url'))
        lom.add_value('technical', technical.load_item())

        lifecycle = LomLifecycleItemloader()
        bundle_organization: dict = kwargs.get('bundle_ld_json_organization')
        # the dictionary that we can parse from the website itself looks like this:
        # 'organization': {'@context': 'http://schema.org',
        #                   '@type': 'Organization',
        #                   'name': 'Materialnetzwerk e. G.',
        #                   'sameAs': ['http://twitter.com/materialnw',
        #                              'https://www.facebook.com/materialnetzwerk'],
        #                   'url': 'https://editor.mnweg.org'}}
        # TODO: once its possible to parse a 'organization'-schema-type as a dictionary by the back-end, use
        #   lifecycle.add_value('organization', bundle_organization)
        if bundle_organization is not None:
            lifecycle.add_value('organization',
                                bundle_organization.get("name"))
            lifecycle.add_value('url', bundle_organization.get("url"))
        lifecycle.add_value('date', date_published)
        lom.add_value('lifecycle', lifecycle.load_item())

        educational = LomEducationalItemLoader()
        # TODO: educationalLevel is currently unsupported in the items.py backend?
        educational_level = kwargs.get('bundle_educational_level')
        if educational_level is not None:
            educational.add_value('educationalLevel', educational_level)
        lom.add_value('educational', educational.load_item())
        base.add_value('lom', lom.load_item())

        vs = ValuespaceItemLoader()
        vs.add_value('learningResourceType', 'teaching module')
        bundle_discipline = kwargs.get('bundle_discipline')
        if bundle_discipline is not None:
            # disciplines without an entry in the mapping are passed on unchanged
            mapped_discipline = self.discipline_mapping.get(bundle_discipline)
            if mapped_discipline is not None:
                bundle_discipline = mapped_discipline
            vs.add_value('discipline', bundle_discipline)
        vs.add_value('intendedEndUserRole', 'teacher')
        #  logged in users can manipulate the worksheets and fit them to their needs,
        #  but there's no login required for just downloading the pdf of an available worksheet
        vs.add_value('conditionsOfAccess',
                     "login required for additional features")
        vs.add_value('price', 'no')
        # we can map "Phase" to our educationalContext with the following ValuespaceHelper method:
        if educational_level is not None:
            vs.add_value(
                "educationalContext",
                ValuespaceHelper.educationalContextByGrade(educational_level))

        lic = LicenseItemLoader()
        # everything is CC-BY-SA 3.0 according to the FAQs: https://mnweg.org/faqs
        lic.add_value('url', Constants.LICENSE_CC_BY_SA_30)
        base.add_value('license', lic.load_item())

        response_loader = ResponseItemLoader()
        response_loader.add_value('url', kwargs.get('bundle_url'))

        base.add_value('valuespaces', vs.load_item())
        base.add_value('response', response_loader.load_item())

        yield base.load_item()
Exemple #9
0
    def parse_site(self,
                   response: scrapy.http.HtmlResponse,
                   sitemap_entry: SitemapEntry = None):
        """
        Builds an item from the first JSON-LD object found in the page and from its sitemap entry.

        :param response: the current scrapy.http.HtmlResponse (its text is searched for JSON-LD)
        :param sitemap_entry: the sitemap entry this page was discovered through; it is stored in
            response.meta['sitemap_entry'] and its "loc" is used as the technical location
        :return: yields the item loaded from the BaseItemLoader
        """

        def as_list(value):
            # a JSON-LD property may hold a single value or a list of values; iterating a bare string would
            # yield its characters, so everything is normalised to a list first
            if value is None or value == "":
                return []
            if isinstance(value, (list, tuple)):
                return value
            return [value]

        # extract the jsonld (a page without any JSON-LD is parsed with empty metadata instead of raising)
        extracted = jslde.extract(response.text)
        data = extracted[0] if extracted else {}
        response.meta['sitemap_entry'] = sitemap_entry
        base = super().getBase(response=response)
        base.add_value("response", super().mapResponse(response).load_item())
        # we assume that content is imported. Please use replace_value if you import something different
        base.add_value("type", Constants.TYPE_MATERIAL)
        base.add_value('thumbnail', data.get("thumbnailUrl", None))
        base.add_value('lastModified', data.get("dateModified", None))
        for publisher in as_list(data.get("publisher")):
            # TODO add type, e.g. organization
            base.add_value("publisher", publisher.get("name"))

        lom = LomBaseItemloader()
        general = LomGeneralItemloader(response=response)
        general.add_value('title', data.get("name", None))
        general.add_value('description', data.get("description", None))
        general.add_value("identifier", data.get("identifier", None))
        for language in as_list(data.get("language")):
            general.add_value("language", language)
        lom.add_value("general", general.load_item())

        technical = LomTechnicalItemLoader()
        technical.add_value('format', 'text/html')
        # sitemap_entry defaults to None: fall back to the URL of the response in that case
        location = sitemap_entry.loc if sitemap_entry is not None else response.url
        technical.add_value('location', location)
        lom.add_value("technical", technical.load_item())

        lifecycle = LomLifecycleItemloader()
        lom.add_value("lifecycle", lifecycle.load_item())
        edu = LomEducationalItemLoader()
        lom.add_value("educational", edu.load_item())
        # classification = LomClassificationItemLoader()
        # lom.add_value("classification", classification.load_item())
        base.add_value("lom", lom.load_item())

        vs = ValuespaceItemLoader()
        for audience in as_list(data.get("audience")):
            vs.add_value("intendedEndUserRole", audience)

        # "about" is read as one comma-separated string of disciplines; a missing "about" means no discipline
        about = data.get("about") or ""
        for discipline in (d.strip() for d in about.split(",")):
            if not discipline:
                # nothing between two commas (or no "about" at all)
                continue
            # known names are translated via about_maps, unknown names are passed through unchanged
            vs.add_value('discipline', about_maps.get(discipline, discipline))

        for lrt in as_list(data.get("type")):
            vs.add_value('learningResourceType', lrt)
        base.add_value("valuespaces", vs.load_item())

        lic = LicenseItemLoader()
        lic.add_value('url', data.get("license", None))
        for creator in as_list(data.get("creator")):
            lic.add_value("author", creator.get("name", ""))

        base.add_value("license", lic.load_item())

        permissions = super().getPermissions(response)

        base.add_value("permissions", permissions.load_item())
        # NOTE(review): "response" was already added above via mapResponse(); this second value is kept as in
        # the original - which of the two ends up in the item depends on the loader's output processor (confirm)
        response_loader = ResponseItemLoader()
        response_loader.add_value('url', response.url)
        base.add_value("response", response_loader.load_item())
        yield base.load_item()
Exemple #10
0
    def parse(self, response: scrapy.http.Response, **kwargs):
        """
        Parses an individual topic url for metadata and yields a BaseItem.

        Scrapy Contracts:
        @url https://www.umwelt-im-unterricht.de/hintergrund/generationengerechtigkeit-klimaschutz-und-eine-lebenswerte-zukunft/
        @returns item 1
        """
        current_url: str = response.url
        base = BaseItemLoader()

        base.add_value('sourceId', response.url)
        date_raw: str = response.xpath(
            '//div[@class="b-cpsuiu-show-info"]/span/text()').get()
        date_cleaned_up: str = w3lib.html.strip_html5_whitespace(date_raw)
        hash_temp = str(date_cleaned_up + self.version)
        base.add_value('hash', hash_temp)
        base.add_value('lastModified', date_cleaned_up)
        base.add_value('type', Constants.TYPE_MATERIAL)
        # base.add_value('thumbnail', thumbnail_url)

        lom = LomBaseItemloader()

        general = LomGeneralItemloader()
        general.add_value('identifier', response.url)
        title: str = response.xpath(
            '//div[@class="tx-cps-uiu"]/article/h1/text()').get()
        general.add_value('title', title)
        keywords: list = response.xpath(
            '//div[@class="b-cpsuiu-show-keywords"]/ul/li/a/text()').getall()
        if len(keywords) >= 1:
            # only add keywords if the list isn't empty
            general.add_value('keyword', keywords)
        description: str = response.xpath(
            '/html/head/meta[@name="description"]/@content').get()
        general.add_value('description', description)
        general.add_value('language', 'de')

        lom.add_value('general', general.load_item())

        technical = LomTechnicalItemLoader()
        technical.add_value('format', 'text/html')
        technical.add_value('location', response.url)
        lom.add_value('technical', technical.load_item())

        lifecycle = LomLifecycleItemloader()
        lifecycle.add_value('role', 'publisher')
        lifecycle.add_value('date', date_cleaned_up)
        lifecycle.add_value('url',
                            "https://www.umwelt-im-unterricht.de/impressum/")
        lifecycle.add_value(
            'organization',
            'Bundesministerium für Umwelt, Naturschutz und nukleare Sicherheit (BMU)'
        )
        lom.add_value('lifecycle', lifecycle.load_item())

        educational = LomEducationalItemLoader()
        educational.add_value('language', 'de')

        # TODO: a didactic comment could fit into either one of these:
        #  - educational.description
        #  - classification.description (with classification.purpose set to 'educational objective')
        if "/wochenthemen/" in current_url:
            # didactic comments are only part of "Thema der Woche"
            didactic_comment = response.xpath(
                '//div[@class="c-collapse-content js-collapse-content"]').get(
                )
            if didactic_comment is not None:
                didactic_comment = w3lib.html.remove_tags(didactic_comment)
                # didactic_comment = w3lib.html.replace_escape_chars(didactic_comment, which_ones='\t', replace_by=" ")
                # didactic_comment = w3lib.html.replace_escape_chars(didactic_comment)
                didactic_comment = " ".join(didactic_comment.split())
                if didactic_comment.endswith("mehr lesenweniger lesen"):
                    # the button-description of the expandable info-box ends up in the string,
                    # therefore we are manually removing it:
                    didactic_comment = didactic_comment.replace(
                        "mehr lesenweniger lesen", "")
                # since there's currently no way to confirm how the string looks in the web-interface:
                # TODO: find out which string format looks best in edu-sharing (cleaned up <-> with escape chars)
                educational.add_value('description', didactic_comment)

        lom.add_value('educational', educational.load_item())

        classification = LomClassificationItemLoader()
        if "/unterrichtsvorschlaege/" in current_url:
            classification.add_value('purpose', 'competency')
            competency_description: list = response.xpath(
                '//div[@class="b-cpsuiu-show-description"]/*[not('
                '@class="cc-licence-info")]').getall()
            # the xpath-expression for competency_description will grab the whole div-element,
            # but EXCLUDE the "license"-container (if the license-description exists, it's always part of the same div)
            if len(competency_description) >= 1:
                # only if the list of strings is not empty, we'll try to type-convert it to a string (and clean its
                # formatting up)
                competency_description: str = " ".join(competency_description)
                competency_description = w3lib.html.remove_tags(
                    competency_description)
                classification.add_value('description', competency_description)

        lom.add_value('classification', classification.load_item())
        base.add_value('lom', lom.load_item())

        vs = ValuespaceItemLoader()

        # depending on the website-category, we need to set a specific learningResourceType
        # because the value 'website' for all crawled items would not be helpful enough
        if "/wochenthemen/" in current_url or "/unterrichtsvorschlaege/" in current_url:
            vs.add_value('learningResourceType', 'lesson plan')
        if "/hintergrund/" in current_url:
            vs.add_value('learningResourceType', 'Text')
        if "/medien/dateien/" in current_url:
            # topics categorized as "Arbeitsmaterial" offer customizable worksheets to teachers
            vs.add_value('learningResourceType', 'worksheet')
        if "/medien/videos/" in current_url:
            vs.add_value('learningResourceType', 'video')
        if "/medien/bilder/" in current_url:
            # topics categorized as "Bilderserie" hold several images in a gallery (with individual licenses)
            vs.add_value('learningResourceType', 'image')

        vs.add_value('price', 'no')
        vs.add_value('containsAdvertisement', 'no')
        vs.add_value('conditionsOfAccess', 'no login')
        vs.add_value('intendedEndUserRole', 'teacher')
        # see: https://www.umwelt-im-unterricht.de/ueber-umwelt-im-unterricht/
        vs.add_value('accessibilitySummary', 'Not tested')
        # see: https://www.umwelt-im-unterricht.de/erklaerung-zur-barrierefreiheit/
        vs.add_value('dataProtectionConformity', 'Sensible data collection')
        # see: https://www.umwelt-im-unterricht.de/datenschutz/

        disciplines_raw: list = response.xpath(
            '//div[@class="b-cpsuiu-show-subjects"]/ul/li/a/text()').getall()
        if len(disciplines_raw) >= 1:
            disciplines = list()
            for discipline_value in disciplines_raw:
                # self.debug_discipline_values.add(discipline_value)
                if discipline_value in self.DISCIPLINE_MAPPING.keys():
                    discipline_value = self.DISCIPLINE_MAPPING.get(
                        discipline_value)
                # since the mapping value can either be a single string OR a list of strings, we need to make sure that
                # our 'disciplines'-list is a list of strings (not a list with nested lists):
                if type(discipline_value) is list:
                    disciplines.extend(discipline_value)
                else:
                    disciplines.append(discipline_value)
            if len(disciplines) >= 1:
                vs.add_value('discipline', disciplines)

        educational_context_raw = response.xpath(
            '//div[@class="b-cpsuiu-show-targets"]/ul/li/a/text()').getall()
        if len(educational_context_raw) >= 1:
            # the educationalContext-mapping is only done when there's at least one educational_context found
            educational_context = list()
            for educational_context_value in educational_context_raw:
                # self.debug_educational_context_values.add(educational_context_value)
                if educational_context_value in self.EDUCATIONAL_CONTEXT_MAPPING.keys(
                ):
                    educational_context_value = self.EDUCATIONAL_CONTEXT_MAPPING.get(
                        educational_context_value)
                if type(educational_context_value) is list:
                    educational_context.extend(educational_context_value)
                else:
                    educational_context.append(educational_context_value)
            if len(educational_context) >= 1:
                vs.add_value('educationalContext', educational_context)

        base.add_value('valuespaces', vs.load_item())

        lic = LicenseItemLoader()
        license_url: str = response.xpath(
            '//div[@class="cc-licence-info"]/p/a[@rel="license"]/@href').get()
        if license_url is not None:
            if license_url.startswith("http://"):
                # the license-mapper expects urls that are in https:// format, but UIU uses http:// links to CC-licenses
                license_url = license_url.replace("http://", "https://")
            lic.add_value('url', license_url)

        license_description_raw: str = response.xpath(
            '//div[@class="cc-licence-info"]').get()
        if license_description_raw is not None:
            license_description_raw = w3lib.html.remove_tags(
                license_description_raw)
            license_description_raw = w3lib.html.replace_escape_chars(
                license_description_raw, which_ones="\n", replace_by=" ")
            # if we called replace_escape_chars() straight away, words that don't belong together would end up stuck
            # together. Just replacing \n with a whitespace first is enough to keep the structure of the string intact.
            license_description_raw = w3lib.html.replace_escape_chars(
                license_description_raw)
            license_description = " ".join(license_description_raw.split())
            # making sure that there's only 1 whitespace between words
            lic.add_value('description', license_description)
        base.add_value('license', lic.load_item())

        permissions = super().getPermissions(response)
        base.add_value('permissions', permissions.load_item())

        response_loader = super().mapResponse(response)
        base.add_value('response', response_loader.load_item())

        yield base.load_item()