def emailtrack(self, response):
    """Queue the contact/about/home links of a page and request the first one.

    Every link extracted from ``response`` whose URL contains one of the
    keyword spellings listed below is kept in extraction order, and the URL
    of ``response`` itself is appended last.  The first queued URL is
    requested with ``self.finalemail`` as callback; the remaining URLs are
    passed on in ``meta['links']`` together with an empty
    ``meta['uniqueemail']`` set and the search context copied from
    ``response.meta``.

    Args:
        response: Response whose ``meta`` carries ``page``, ``category``,
            ``index``, ``find``, ``near``, ``catg`` and ``duplicateurl``.

    Yields:
        Exactly one ``SeleniumRequest``.

    Raises:
        KeyError: If one of the ``meta`` keys listed above is missing.
    """
    # Matching is deliberately case-sensitive on these exact spellings.
    keywords = (
        'Contact', 'contact', 'About', 'about', 'home', 'Home', 'HOME',
        'CONTACT', 'ABOUT',
    )
    # Search context handed on unchanged to whichever callback runs next.
    context = {
        key: response.meta[key]
        for key in (
            'page', 'category', 'index', 'find', 'near', 'catg',
            'duplicateurl',
        )
    }

    extracted = LxmlLinkExtractor(allow=()).extract_links(response)
    page_urls = [str(link.url) for link in extracted]
    queue = [url for url in page_urls if any(word in url for word in keywords)]
    # The page itself is always visited too, after the keyword matches.
    queue.append(str(response.url))

    # NOTE(review): scrapy-selenium appears to apply wait_time only when
    # wait_until is also given -- confirm that wait_time=1000 is intended.
    if queue:
        first_url, *remaining = queue
        yield SeleniumRequest(
            url=first_url,
            wait_time=1000,
            screenshot=True,
            callback=self.finalemail,
            dont_filter=True,
            meta=dict(context, links=remaining, uniqueemail=set()),
        )
    else:
        # NOTE(review): the page URL is appended unconditionally above, so
        # the queue is never empty and this fallback looks unreachable --
        # confirm before removing it.
        yield SeleniumRequest(
            url='https://www.google.com/',
            wait_time=1000,
            screenshot=True,
            callback=self.scrapepages,
            dont_filter=True,
            meta=context,
        )
def emailtrack(self, response):
    """Queue the contact/about links of a page and request the first one.

    Every link extracted from ``response`` whose URL contains one of the
    keyword spellings listed below is kept in extraction order, and the URL
    of ``response`` itself is appended last.  The first queued URL is
    requested with ``self.finalemail`` as callback and
    ``self.errback_finalemail`` as errback; the remaining URLs are passed on
    in ``meta['links']`` together with an empty ``meta['uniqueemail']`` set
    and the ``index`` / ``web_name`` / ``web_type`` values copied from
    ``response.meta``.

    Args:
        response: Response whose ``meta`` carries ``index``, ``web_name``
            and ``web_type``.

    Yields:
        Exactly one ``SeleniumRequest``.

    Raises:
        KeyError: If one of the ``meta`` keys listed above is missing.
    """
    # Matching is deliberately case-sensitive on these exact spellings.
    keywords = ('Contact', 'contact', 'About', 'about', 'CONTACT', 'ABOUT')
    # Values handed on unchanged to whichever callback runs next.
    context = {
        key: response.meta[key]
        for key in ('index', 'web_name', 'web_type')
    }

    extracted = LxmlLinkExtractor(allow=()).extract_links(response)
    page_urls = [str(link.url) for link in extracted]
    queue = [url for url in page_urls if any(word in url for word in keywords)]
    # The page itself is always visited too, after the keyword matches.
    queue.append(str(response.url))

    # NOTE(review): scrapy-selenium appears to apply wait_time only when
    # wait_until is also given -- confirm that wait_time=1000 is intended.
    if queue:
        first_url, *remaining = queue
        yield SeleniumRequest(
            url=first_url,
            wait_time=1000,
            screenshot=True,
            callback=self.finalemail,
            errback=self.errback_finalemail,
            meta=dict(context, uniqueemail=set(), links=remaining),
            dont_filter=True,
        )
    else:
        # NOTE(review): the page URL is appended unconditionally above, so
        # the queue is never empty and this fallback looks unreachable --
        # confirm before removing it.
        yield SeleniumRequest(
            url='https://www.google.com/',
            wait_time=1000,
            screenshot=True,
            callback=self.parse_page,
            errback=self.errback_google,
            meta=dict(context, finalemail=[], links=queue),
            dont_filter=True,
        )
def emailtrack(self, response):
    """Queue the contact/about links of a page and request the first one.

    Every link extracted from ``response`` is kept, in extraction order,
    when its URL contains one of the keyword spellings below and none of the
    excluded site names.  The URL of ``response`` itself is appended last.
    The first queued URL is requested with ``self.finalemail`` as callback
    and ``self.errback_finalemail`` as errback; the remaining URLs are
    passed on in ``meta['links']`` together with an empty
    ``meta['uniqueemail']`` set and the ``page`` / ``index`` / ``find`` /
    ``near`` values copied from ``response.meta``.

    Args:
        response: Response whose ``meta`` carries ``page``, ``index``,
            ``find`` and ``near`` (and ``driver`` for the fallback branch).

    Yields:
        Exactly one ``SeleniumRequest``.

    Raises:
        KeyError: If a required ``meta`` key is missing.
    """
    # Matching is deliberately case-sensitive on these exact spellings.
    keywords = ('Contact', 'contact', 'About', 'about', 'CONTACT', 'ABOUT')
    # Links whose URL mentions any of these sites are never followed.
    excluded = (
        'facebook', 'instagram', 'youtube', 'twitter', 'wiki', 'linkedin',
    )
    # Values handed on unchanged to whichever callback runs next.
    context = {
        key: response.meta[key]
        for key in ('page', 'index', 'find', 'near')
    }

    extracted = LxmlLinkExtractor(allow=()).extract_links(response)
    page_urls = [str(link.url) for link in extracted]
    queue = [
        url
        for url in page_urls
        if any(word in url for word in keywords)
        and not any(site in url for site in excluded)
    ]
    # The page itself is always visited too, after the keyword matches.
    queue.append(str(response.url))

    # NOTE(review): scrapy-selenium appears to apply wait_time only when
    # wait_until is also given -- confirm that wait_time=1000 is intended.
    if queue:
        first_url, *remaining = queue
        yield SeleniumRequest(
            url=first_url,
            wait_time=1000,
            screenshot=True,
            callback=self.finalemail,
            errback=self.errback_finalemail,
            dont_filter=True,
            meta=dict(context, links=remaining, uniqueemail=set()),
        )
    else:
        # NOTE(review): the page URL is appended unconditionally above, so
        # the queue is never empty and this fallback looks unreachable --
        # confirm before removing it.
        driver = response.meta['driver']
        yield SeleniumRequest(
            url=driver.current_url,
            wait_time=1000,
            screenshot=True,
            callback=self.data_save,
            errback=self.error_google,
            dont_filter=True,
            meta=dict(context, finalemail=[]),
        )