Example 1
    def parse(self, response):
        links = LxmlLinkExtractor(allow=()).extract_links(response)
        links = [str(link.url) for link in links]
        links.append(str(response.url))

        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_link)
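
This pattern collects every link on the page, adds the page's own URL, and schedules each one for a second callback. For context, a minimal self-contained spider built around it could look like the sketch below; the spider name, start URL, and parse_link body are assumptions, not part of the original example.

    import scrapy
    from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor


    class LinkSpider(scrapy.Spider):
        name = 'link_spider'                   # hypothetical name
        start_urls = ['https://example.com']   # placeholder start URL

        def parse(self, response):
            # Extract all links, add the current page, follow each one.
            links = [str(link.url)
                     for link in LxmlLinkExtractor(allow=()).extract_links(response)]
            links.append(str(response.url))
            for link in links:
                yield scrapy.Request(url=link, callback=self.parse_link)

        def parse_link(self, response):
            # Hypothetical callback: record each visited URL.
            yield {'url': response.url}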
Example 2
    def parse(self, response):
        url_to_follow = LxmlLinkExtractor(allow=()).extract_links(response)
        url_to_follow = [str(link.url) for link in url_to_follow]
        url_to_follow.append(str(response.url))
        for url in url_to_follow:
            yield scrapy.Request(url=url,
                                 callback=self.parse_email,
                                 dont_filter=True)
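
Compared with Example 1, the notable addition is dont_filter=True: because response.url is appended after it has already been visited, Scrapy's duplicate filter would otherwise drop that request. The parse_email callback is not included in the snippet; a plausible sketch of it (the regex and the body are assumptions) is:

    import re

    EMAIL_RE = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')

    def parse_email(self, response):
        # Assumed implementation: scan the page text for e-mail-like
        # strings and emit each distinct address once.
        for address in set(EMAIL_RE.findall(response.text)):
            yield {'email': address, 'source': response.url}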
Example 3
    def emailtrack(self, response):
        driver = response.meta['driver']
        html = driver.page_source
        # Selector over the Selenium-rendered page source; note that the
        # link extraction below still runs on the original `response`,
        # since extract_links() expects a Response, not a Selector.
        response_obj = Selector(text=html)
        page = response.meta['page']
        category = response.meta['category']
        index = response.meta['index']
        find = response.meta['find']
        near = response.meta['near']
        catg = response.meta['catg']
        duplicateurl = response.meta['duplicateurl']
        links = LxmlLinkExtractor(allow=()).extract_links(response)
        Finallinks = [str(link.url) for link in links]
        # Keep only contact/about/home-style pages (case-insensitive).
        links = [link for link in Finallinks
                 if any(key in link.lower()
                        for key in ('contact', 'about', 'home'))]

        # The current page is always appended, so `links` is never empty
        # and the else branch below is effectively unreachable.
        links.append(str(response.url))

        if links:
            l = links.pop(0)
            uniqueemail = set()
            yield SeleniumRequest(url=l,
                                  wait_time=1000,
                                  screenshot=True,
                                  callback=self.finalemail,
                                  dont_filter=True,
                                  meta={
                                      'links': links,
                                      'page': page,
                                      'category': category,
                                      'index': index,
                                      'find': find,
                                      'near': near,
                                      'catg': catg,
                                      'duplicateurl': duplicateurl,
                                      'uniqueemail': uniqueemail
                                  })
        else:
            yield SeleniumRequest(url='https://www.google.com/',
                                  wait_time=1000,
                                  screenshot=True,
                                  callback=self.scrapepages,
                                  dont_filter=True,
                                  meta={
                                      'page': page,
                                      'category': category,
                                      'index': index,
                                      'find': find,
                                      'near': near,
                                      'catg': catg,
                                      'duplicateurl': duplicateurl
                                  })
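
Here the crawl state (page, category, the remaining links, the e-mail set, and so on) travels from request to request through meta. The finalemail callback is not part of the snippet; one plausible continuation (body assumed, reusing the EMAIL_RE pattern from the Example 2 sketch) pops the next link and recurses until the list is empty:

    def finalemail(self, response):
        # Assumed body: harvest e-mails from the rendered page, then
        # follow the remaining links carried in meta until none are left.
        uniqueemail = response.meta['uniqueemail']
        uniqueemail.update(EMAIL_RE.findall(response.text))
        links = response.meta['links']
        if links:
            yield SeleniumRequest(url=links.pop(0),
                                  wait_time=1000,
                                  callback=self.finalemail,
                                  dont_filter=True,
                                  meta=dict(response.meta, links=links))
        else:
            yield {'emails': sorted(uniqueemail)}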
Example 4
    def emailtrack(self, response):
        driver = response.meta['driver']
        index = response.meta['index']
        web_name = response.meta['web_name']
        web_type = response.meta['web_type']

        html = driver.page_source
        response_obj = Selector(text=html)

        links = LxmlLinkExtractor(allow=()).extract_links(response)
        Finallinks = [str(link.url) for link in links]
        # Keep only contact/about-style pages (case-insensitive).
        links = [link for link in Finallinks
                 if any(key in link.lower() for key in ('contact', 'about'))]

        links.append(str(response.url))

        if links:
            l = links.pop(0)
            uniqueemail = set()

            yield SeleniumRequest(url=l,
                                  wait_time=1000,
                                  screenshot=True,
                                  callback=self.finalemail,
                                  errback=self.errback_finalemail,
                                  meta={
                                      'index': index,
                                      'web_name': web_name,
                                      'web_type': web_type,
                                      'uniqueemail': uniqueemail,
                                      'links': links
                                  },
                                  dont_filter=True)
        else:
            finalemail = []
            yield SeleniumRequest(url='https://www.google.com/',
                                  wait_time=1000,
                                  screenshot=True,
                                  callback=self.parse_page,
                                  errback=self.errback_google,
                                  meta={
                                      'index': index,
                                      'web_name': web_name,
                                      'web_type': web_type,
                                      'finalemail': finalemail,
                                      'links': links
                                  },
                                  dont_filter=True)
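
Unlike Example 3, this version wires up errbacks, so a failed Selenium request does not silently end the chain. The handlers themselves are not shown; a minimal sketch of one (name taken from the snippet, body assumed):

    def errback_finalemail(self, failure):
        # Assumed handler: log the failing URL and the error; the chain
        # could be resumed here by yielding a request for the next link.
        self.logger.error('finalemail failed for %s: %r',
                          failure.request.url, failure.value)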
Example 5
    def emailtrack(self, response):
        driver = response.meta['driver']
        html = driver.page_source
        response_obj = Selector(text=html)
        page = response.meta['page']
        # category = response.meta['category']
        index = response.meta['index']
        find = response.meta['find']
        near = response.meta['near']
        # catg = response.meta['catg']
        # duplicateurl = response.meta['duplicateurl']
        links = LxmlLinkExtractor(allow=()).extract_links(response)
        Finallinks = [str(link.url) for link in links]
        # First pass: keep contact/about-style pages (case-insensitive).
        linkscheck = [link for link in Finallinks
                      if any(key in link.lower() for key in ('contact', 'about'))]

        # Second pass: drop links that point at social networks or wikis.
        blocked = ('facebook', 'instagram', 'youtube',
                   'twitter', 'wiki', 'linkedin')
        links = [link for link in linkscheck
                 if not any(word in link for word in blocked)]
        links.append(str(response.url))

        if links:
            l = links.pop(0)
            uniqueemail = set()
            yield SeleniumRequest(
                url=l,
                wait_time=1000,
                screenshot=True,
                callback=self.finalemail,
                errback=self.errback_finalemail,
                dont_filter=True,
                meta={'links': links, 'page': page, 'index': index, 'find': find, 'near': near,
                      'uniqueemail': uniqueemail}
            )
        else:
            finalemail = []
            yield SeleniumRequest(
                url=driver.current_url,
                wait_time=1000,
                screenshot=True,
                callback=self.data_save,
                errback=self.error_google,
                dont_filter=True,
                meta={'page': page, 'index': index, 'find': find, 'near': near, 'finalemail': finalemail}
            )
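
This variant filters in two passes: keyword matching first, then a blocklist of social-network and wiki sites. Because the blocklist words are checked against the whole URL, a path that merely contains 'twitter' is dropped too; a stricter alternative (not the original behaviour) inspects only the hostname:

    from urllib.parse import urlparse

    BLOCKED = ('facebook', 'instagram', 'youtube', 'twitter', 'wiki', 'linkedin')

    def is_allowed(link):
        # Reject the link only when a blocked word occurs in its hostname.
        host = urlparse(link).netloc.lower()
        return not any(word in host for word in BLOCKED)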
Example 6
    def parse(self, response):

        # deny_domains takes bare domain names; full 'https://...' URLs
        # would never match and nothing would be filtered.
        links = LxmlLinkExtractor(allow=(),
                                  deny_domains=[
                                      'booking.com',
                                      'hotels.com',
                                      'telegraph.co.uk',
                                      'agoda.com'
                                  ]).extract_links(response)
        links = [str(link.url) for link in links]
        links.append(str(response.url))

        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_link)
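
deny_domains matches registered domains and their subdomains, so 'booking.com' also covers www.booking.com. A quick standalone check (illustrative only; the HTML body is made up):

    from scrapy.http import HtmlResponse
    from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor

    body = (b'<a href="https://www.booking.com/x">a</a>'
            b'<a href="https://example.com/y">b</a>')
    response = HtmlResponse(url='https://example.com',
                            body=body, encoding='utf-8')
    extractor = LxmlLinkExtractor(deny_domains=['booking.com'])
    print([link.url for link in extractor.extract_links(response)])
    # Prints only the example.com link.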