Esempi in Python per Selector.endswith

Linguaggio di programmazione: Python

Spazio dei nomi/nome del pacchetto: scrapy

Classe/tipologia: Selector

Metodo/funzione: endswith

Esempi su hotexamples.com: 1

Selector.endswith in Python: 1 esempio trovato. Questo è il miglior esempio reale in Python per scrapy.Selector.endswith, estratto da progetti open source. Lo puoi valutare, per aiutarci a migliorare la qualità dei nostri esempi.

Metodi utilizzati di frequente

Mostra Nascondi

Selector(30)

css(30)

split(30)

xpath(30)

re(24)

extract(22)

replace(11)

strip(9)

__len__(8)

remove_namespaces(7)

startswith(7)

find(6)

select(6)

__contains__(4)

extract_first(3)

index(3)

append(2)

register_namespace(2)

re_first(2)

group(2)

get(2)

findall(2)

endswith(1)

rsplit(1)

json(1)

select_by_visible_text(1)

isdigit(1)

Esempio n. 1

Mostra file

File: zum_physik_apps_spider.py Progetto: openeduhub/oeh-search-etl

    def parse(self, response: scrapy.http.Response, **kwargs):
        """
        Populates a BaseItemLoader with metadata and yields the individual BaseItem via BaseItemLoader.load_item()
        afterwards.

        Scrapy Contracts:
        @url https://www.walter-fendt.de/html5/phde/newtonlaw2_de.htm
        @returns item 1
        """
        # fetching publication date and lastModified from dynamically loaded <p class="Ende">-element:
        url_data_splash_dict = WebTools.getUrlData(response.url,
                                                   engine=WebEngine.Pyppeteer)
        splash_html_string = url_data_splash_dict.get('html')
        page_end_element = Selector(
            text=splash_html_string).xpath('//p[@class="Ende"]').get()
        line_regex = re.compile(r'<br>')
        page_end_string = line_regex.split(page_end_element)
        published_date = None
        last_modified = None
        # the two strings inside the <p>-Container will look like this:
        # Walter Fendt, 2. November 2000
        # Letzte Änderung: 17. Oktober 2017
        # therefore we'll need to extract the dates by splitting up the strings
        for temp_string in page_end_string:
            if temp_string.startswith("Walter Fendt"):
                sentence1 = temp_string.rsplit(', ')
                # each "sentence" list now holds exactly 2 elements, whereby the last element should be the date
                for item in sentence1:
                    if dateparser.parse(item) is not None:
                        published_date = dateparser.parse(item)
            if temp_string.startswith('Letzte Änderung:'):
                sentence2 = temp_string.rsplit(': ')
                for item2 in sentence2:
                    if dateparser.parse(item2) is not None:
                        last_modified = dateparser.parse(item2)

        base = super().getBase(response=response)
        base.add_value('type', Constants.TYPE_MATERIAL)
        if last_modified is not None:
            hash_temp = last_modified.isoformat() + self.version
            base.add_value('hash', hash_temp)
            base.add_value('lastModified', last_modified.isoformat())
        base.add_value('sourceId', response.url)

        lom = LomBaseItemloader()
        general = LomGeneralItemloader(response=response)
        general.add_value('identifier', response.url)
        general.add_value('title',
                          response.xpath('/html/head/title/text()').get())
        general.add_value(
            'description',
            response.xpath(
                '/html/head/meta[@name="description"]/@content').get())
        keywords_string: str = response.xpath(
            '/html/head/meta[@name="keywords"]/@content').get()
        if keywords_string is not None:
            keyword_list = keywords_string.rsplit(", ")
            general.add_value('keyword', keyword_list)
        general.add_value('language', 'de')
        lom.add_value('general', general.load_item())

        technical = LomTechnicalItemLoader()
        technical.add_value('format', "text/html")
        technical.add_value('location', response.url)
        lom.add_value('technical', technical.load_item())

        lifecycle = LomLifecycleItemloader()
        lifecycle.add_value('role', 'author')
        lifecycle.add_value('firstName', 'Walter')
        lifecycle.add_value('lastName', 'Fendt')
        lifecycle.add_value(
            'url', "https://www.walter-fendt.de/wf.htm")  # author information
        if published_date is not None:
            lifecycle.add_value('date', published_date.isoformat())
        lom.add_value('lifecycle', lifecycle.load_item())

        educational = LomEducationalItemLoader()
        educational.add_value('interactivityType', 'mixed')
        lom.add_value('educational', educational.load_item())

        base.add_value('lom', lom.load_item())

        vs = ValuespaceItemLoader()
        vs.add_value('conditionsOfAccess', 'no login')
        vs.add_value('discipline', 'Physik')
        vs.add_value('intendedEndUserRole', ['learner', 'teacher', 'parent'])
        vs.add_value('learningResourceType', ['application', 'web page'])
        vs.add_value('price', 'no')
        base.add_value('valuespaces', vs.load_item())

        lic = LicenseItemLoader()
        lic.add_value('author', 'Walter Fendt')
        # if scrapy could render the <p class="Ende">-element, the license url could be found with the following XPath:
        # license_url = response.xpath('//p[@class="Ende"]/a[@rel="license"]/@href')
        # but since scrapy can't "see" this container, we're extracting the information with scrapy-splash
        license_url = Selector(text=splash_html_string).xpath(
            '//p[@class="Ende"]/a[@rel="license"]/@href').get()
        if license_url is not None:
            # the license url links to the /de/ version, which currently doesn't get mapped properly
            # "https://creativecommons.org/licenses/by-nc-sa/3.0/de/"
            # -> 'https://creativecommons.org/licenses/by-nc-sa/3.0/' is the url-format we want
            if "creativecommons.org/licenses/" in license_url and license_url.endswith(
                    "/de/"):
                license_url = license_url.split("de/")[0]
            lic.add_value('url', license_url)
        base.add_value('license', lic.load_item())

        permissions = super().getPermissions(response)
        base.add_value('permissions', permissions.load_item())
        base.add_value('response', super().mapResponse(response).load_item())

        yield base.load_item()