Code example #1
File: scrap.py  Project: VickyMutai/email-finder
    def parse(self, response):
        # Extract every link on the page (allow=() matches all URLs).
        links = LxmlLinkExtractor(allow=()).extract_links(response)
        links = [str(link.url) for link in links]
        # Include the current page itself so it is also scanned.
        links.append(str(response.url))

        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_link)
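
Each snippet on this page is a single method lifted from a larger Scrapy spider, so imports and class scaffolding are omitted. A minimal self-contained sketch of what examples #1 and #2 assume follows; the spider name, start_urls, and the parse_link stub are hypothetical, not taken from the quoted projects.

import scrapy
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor


class EmailSpider(scrapy.Spider):
    # Hypothetical scaffolding; only the parse() body mirrors the examples.
    name = 'email_finder'
    start_urls = ['https://example.com/']

    def parse(self, response):
        links = [str(link.url)
                 for link in LxmlLinkExtractor(allow=()).extract_links(response)]
        links.append(str(response.url))
        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_link)

    def parse_link(self, response):
        # Placeholder; see the email-extraction sketch after example #6.
        pass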
Code example #2
    def parse(self, response):
        url_to_follow = LxmlLinkExtractor(allow=()).extract_links(response)
        url_to_follow = [str(link.url) for link in url_to_follow]
        url_to_follow.append(str(response.url))
        for url in url_to_follow:
            # dont_filter=True bypasses Scrapy's duplicate-request filter.
            yield scrapy.Request(url=url,
                                 callback=self.parse_email,
                                 dont_filter=True)
Code example #3
    def emailtrack(self, response):
        driver = response.meta['driver']
        html = driver.page_source
        response_obj = Selector(text=html)  # built but never used below
        page = response.meta['page']
        category = response.meta['category']
        index = response.meta['index']
        find = response.meta['find']
        near = response.meta['near']
        catg = response.meta['catg']
        duplicateurl = response.meta['duplicateurl']

        # Keep only contact/about/home-style links (case variants checked explicitly).
        Finallinks = [str(link.url)
                      for link in LxmlLinkExtractor(allow=()).extract_links(response)]
        keywords = ('Contact', 'contact', 'About', 'about', 'home',
                    'Home', 'HOME', 'CONTACT', 'ABOUT')
        links = [link for link in Finallinks
                 if any(keyword in link for keyword in keywords)]

        links.append(str(response.url))

        # links always contains at least response.url at this point, so this
        # branch always runs and the else branch below is effectively dead code.
        if len(links) > 0:
            l = links.pop(0)
            uniqueemail = set()
            yield SeleniumRequest(url=l,
                                  wait_time=1000,
                                  screenshot=True,
                                  callback=self.finalemail,
                                  dont_filter=True,
                                  meta={
                                      'links': links,
                                      'page': page,
                                      'category': category,
                                      'index': index,
                                      'find': find,
                                      'near': near,
                                      'catg': catg,
                                      'duplicateurl': duplicateurl,
                                      'uniqueemail': uniqueemail
                                  })
        else:
            yield SeleniumRequest(url='https://www.google.com/',
                                  wait_time=1000,
                                  screenshot=True,
                                  callback=self.scrapepages,
                                  dont_filter=True,
                                  meta={
                                      'page': page,
                                      'category': category,
                                      'index': index,
                                      'find': find,
                                      'near': near,
                                      'catg': catg,
                                      'duplicateurl': duplicateurl
                                  })
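
Examples #3 through #5 use SeleniumRequest from the scrapy-selenium package (imported as "from scrapy_selenium import SeleniumRequest", with Selector coming from scrapy.selector), which only works once its downloader middleware is enabled. A sketch of the settings.py entries such a spider assumes; the driver name, executable path, and arguments are illustrative values, not taken from the quoted projects:

# settings.py -- scrapy-selenium configuration (illustrative values)
SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = '/path/to/chromedriver'
SELENIUM_DRIVER_ARGUMENTS = ['--headless']

DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800
}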
Code example #4
    def emailtrack(self, response):
        driver = response.meta['driver']
        index = response.meta['index']
        web_name = response.meta['web_name']
        web_type = response.meta['web_type']

        html = driver.page_source
        response_obj = Selector(text=html)  # built but never used below

        # Keep only contact/about-style links (case variants checked explicitly).
        Finallinks = [str(link.url)
                      for link in LxmlLinkExtractor(allow=()).extract_links(response)]
        keywords = ('Contact', 'contact', 'About', 'about', 'CONTACT', 'ABOUT')
        links = [link for link in Finallinks
                 if any(keyword in link for keyword in keywords)]

        links.append(str(response.url))

        if len(links) > 0:
            l = links.pop(0)
            uniqueemail = set()

            yield SeleniumRequest(url=l,
                                  wait_time=1000,
                                  screenshot=True,
                                  callback=self.finalemail,
                                  errback=self.errback_finalemail,
                                  meta={
                                      'index': index,
                                      'web_name': web_name,
                                      'web_type': web_type,
                                      'uniqueemail': uniqueemail,
                                      'links': links
                                  },
                                  dont_filter=True)
        else:
            finalemail = []
            yield SeleniumRequest(url='https://www.google.com/',
                                  wait_time=1000,
                                  screenshot=True,
                                  callback=self.parse_page,
                                  errback=self.errback_google,
                                  meta={
                                      'index': index,
                                      'web_name': web_name,
                                      'web_type': web_type,
                                      'finalemail': finalemail,
                                      'links': links
                                  },
                                  dont_filter=True)
Code example #5
    def emailtrack(self, response):
        driver = response.meta['driver']
        html = driver.page_source
        response_obj = Selector(text=html)  # built but never used below
        page = response.meta['page']
        # category = response.meta['category']
        index = response.meta['index']
        find = response.meta['find']
        near = response.meta['near']
        # catg = response.meta['catg']
        # duplicateurl = response.meta['duplicateurl']

        # Keep only contact/about-style links, then drop social-media and wiki URLs.
        Finallinks = [str(link.url)
                      for link in LxmlLinkExtractor(allow=()).extract_links(response)]
        keywords = ('Contact', 'contact', 'About', 'about', 'CONTACT', 'ABOUT')
        linkscheck = [link for link in Finallinks
                      if any(keyword in link for keyword in keywords)]

        excluded = ('facebook', 'instagram', 'youtube',
                    'twitter', 'wiki', 'linkedin')
        links = [link for link in linkscheck
                 if not any(site in link for site in excluded)]
        links.append(str(response.url))

        if len(links) > 0:
            l = links.pop(0)
            uniqueemail = set()
            yield SeleniumRequest(
                url=l,
                wait_time=1000,
                screenshot=True,
                callback=self.finalemail,
                errback=self.errback_finalemail,
                dont_filter=True,
                meta={'links': links, 'page': page, 'index': index,
                      'find': find, 'near': near, 'uniqueemail': uniqueemail}
            )
        else:
            finalemail = []
            yield SeleniumRequest(
                url=driver.current_url,
                wait_time=1000,
                screenshot=True,
                callback=self.data_save,
                errback=self.error_google,
                dont_filter=True,
                meta={'page': page, 'index': index, 'find': find,
                      'near': near, 'finalemail': finalemail}
            )
Code example #6
    def parse(self, response):
        # deny_domains expects bare domain names, not full URLs: entries such
        # as 'https://www.booking.com' would never match anything.
        links = LxmlLinkExtractor(allow=(),
                                  deny_domains=[
                                      'booking.com',
                                      'hotels.com',
                                      'telegraph.co.uk',
                                      'agoda.com'
                                  ]).extract_links(response)
        links = [str(link.url) for link in links]
        links.append(str(response.url))

        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_link)
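
None of the examples include the callbacks they schedule (parse_link, parse_email, finalemail, and so on), which is where addresses would actually be harvested. A hypothetical minimal version based on a regex scan of the response body; the pattern, the method name, and the yielded dict shape are illustrative and not taken from the quoted projects:

import re

# Hypothetical pattern; not part of the projects quoted above.
EMAIL_RE = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}')

def parse_link(self, response):
    # De-duplicate email-like strings found in the raw page text.
    for email in set(EMAIL_RE.findall(response.text)):
        yield {'email': email, 'source': response.url}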