def extract(self, content):
    """Harvest target subdomains from one page of search results.

    Scans *content* for ``<cite>`` elements whose host ends with the
    target domain, records previously-unseen subdomains in
    ``self.subdomains`` and flags ``self.find_new_domain`` when this
    page yielded anything new.

    :param content: raw HTML of a search-result page.
    :return: True if the page contains the next-page marker ("下一页"),
        meaning the engine should request another page; False otherwise.
    """
    # re.escape keeps the dots in the domain from acting as regex
    # wildcards (e.g. "example.com" must not match "exampleXcom").
    pattern = re.compile(
        r'<cite class=".*?">(.*?{domain})/.*?</cite>'.format(
            domain=re.escape(self.target.netloc)))
    next_page = "下一页"
    try:
        links = pattern.findall(content)
        self.find_new_domain = False
        for link in links:
            # Results are bare host/path strings; add a scheme so
            # urlparse can extract the netloc.
            if not link.startswith(('http://', 'https://')):
                link = "http://" + link
            subdomain = urlparse.urlparse(link).netloc
            # Require a dot boundary so e.g. "notexample.com" is not
            # mistaken for a subdomain of "example.com"; this also
            # excludes the bare target domain itself.
            if (subdomain.endswith('.' + self.target.netloc)
                    and subdomain not in self.subdomains):
                self.logger.info("{engine} Found {subdomain}".format(
                    engine=self.engine_name, subdomain=subdomain))
                self.subdomains.update([subdomain])
                self.find_new_domain = True
    except Exception:
        # Best-effort scraping: a malformed page must not kill the engine.
        pass
    # Tell the engine whether there is another result page to fetch.
    return next_page in content
def extract(self, content):
    """Harvest target subdomains from one page of search results.

    Scans *content* for nofollow anchors whose host ends with the target
    domain, records previously-unseen subdomains in ``self.subdomains``
    and keeps ``self.last_domain`` pointing at the most recent find (or
    the target itself when nothing new appeared).

    :param content: raw HTML of a search-result page.
    :return: True if a "Next page" link is present, meaning the engine
        should request another page; False otherwise.
    """
    # Raw strings so \s is a regex escape, not a (deprecated) string
    # escape; re.escape keeps the domain's dots literal.
    next_page = re.compile(r'<A.*?>\s*<b>Next page</b>\s*</a>')
    pattern = re.compile(
        r'<a href="http[s]*://(.*{domain}).*?" rel="nofollow">'.format(
            domain=re.escape(self.target.netloc)))
    try:
        links = pattern.findall(content)
        self.last_domain = self.target.netloc
        for link in links:
            # Captured text is a bare host; add a scheme so urlparse
            # can extract the netloc.
            if not link.startswith(('http://', 'https://')):
                link = "http://" + link
            subdomain = urlparse.urlparse(link).netloc
            # Require a dot boundary so e.g. "notexample.com" is not
            # mistaken for a subdomain of "example.com".
            if (subdomain.endswith('.' + self.target.netloc)
                    and subdomain not in self.subdomains):
                self.logger.info("{engine} Found {subdomain}".format(
                    engine=self.engine_name, subdomain=subdomain))
                self.subdomains.update([subdomain])
                self.last_domain = subdomain
    except Exception:
        # Best-effort scraping: a malformed page must not kill the engine.
        pass
    # Tell the engine whether there is another result page to fetch.
    return bool(next_page.findall(content))
def extract(self, content):
    """Harvest target subdomains from one page of search results.

    Scans *content* for javascript/onclick anchors whose text ends with
    the target domain and records previously-unseen subdomains in
    ``self.subdomains``.

    :param content: raw HTML of a search-result page.
    :return: True if the page contains the next-page marker ("下一页"),
        meaning the engine should request another page; False otherwise.
    """
    # re.escape keeps the dots in the domain from acting as regex
    # wildcards.
    pattern = re.compile(
        r'<a href="javascript:" onclick="window.open.*?" target="_blank">(.*?{domain})</a>'
        .format(domain=re.escape(self.target.netloc)))
    next_page = "下一页"
    try:
        links = pattern.findall(content)
        for link in links:
            # Anchor text is a bare host; add a scheme so urlparse can
            # extract the netloc.
            if not link.startswith(('http://', 'https://')):
                link = "http://" + link
            subdomain = urlparse.urlparse(link).netloc
            # Require a dot boundary so e.g. "notexample.com" is not
            # mistaken for a subdomain of "example.com".
            if (subdomain.endswith('.' + self.target.netloc)
                    and subdomain not in self.subdomains):
                self.logger.info("{engine} Found {subdomain}".format(
                    engine=self.engine_name, subdomain=subdomain))
                self.subdomains.update([subdomain])
    except Exception:
        # Best-effort scraping: a malformed page must not kill the engine.
        pass
    # Tell the engine whether there is another result page to fetch.
    return next_page in content
def extract(self, content):
    """Harvest target subdomains from one page of search results.

    Scans *content* for ``<span>`` results whose bolded host ends with
    the target domain, records previously-unseen subdomains in
    ``self.subdomains`` and flags ``self.find_new_domain`` when this
    page yielded anything new.

    :param content: raw HTML of a search-result page.
    :return: True if a 'class="next"' pager link is present, meaning the
        engine should request another page; False otherwise.
    """
    next_page = re.compile(r'<a class="next".*?>Next</a>')
    # re.escape keeps the domain's dots literal; the capture group keeps
    # surrounding inline markup, which is stripped below.
    pattern = re.compile(
        r'<span class=.{1,100}?>(.{0,100}?<b.{0,100}?>'
        + re.escape(self.target.netloc) + r'</b>)')
    try:
        links = pattern.findall(content)
        self.find_new_domain = False
        for link in links:
            # Drop the inline tags (<b>…</b>) captured along with the host.
            link = re.sub(r'<.*?>', '', link)
            if not link.startswith(('http://', 'https://')):
                link = "http://" + link
            subdomain = urlparse.urlparse(link).netloc
            # Require a dot boundary so e.g. "notexample.com" is not
            # mistaken for a subdomain of "example.com".
            if (subdomain.endswith('.' + self.target.netloc)
                    and subdomain not in self.subdomains):
                self.logger.info(
                    "{engine} Found {subdomain}".format(
                        engine=self.engine_name, subdomain=subdomain))
                self.subdomains.update([subdomain])
                self.find_new_domain = True
    except Exception:
        # Best-effort scraping: a malformed page must not kill the engine.
        pass
    # Tell the engine whether there is another result page to fetch.
    return bool(next_page.findall(content))
def extract(self, content):
    """Harvest target subdomains from one page of search results.

    Scans *content* for result-URL paragraphs whose host ends with the
    target domain, records previously-unseen subdomains in
    ``self.subdomains`` and flags ``self.find_new_domain`` when this
    page yielded anything new.

    :param content: raw HTML of a search-result page.
    :return: True if the pagination "Next" element is present, meaning
        the engine should request another page; False otherwise.
    """
    next_page = '<li class="PartialWebPagination-next">Next</li>'
    # Raw string so \. is a regex escape, not a (deprecated) string
    # escape; re.escape keeps the domain's dots literal.
    pattern = re.compile(
        r'<p class="PartialSearchResults-item-url">(.*?\.{domain}).*?</p>'
        .format(domain=re.escape(self.target.netloc)))
    try:
        links = pattern.findall(content)
        self.find_new_domain = False
        for link in links:
            # Captured text is a bare host; add a scheme so urlparse
            # can extract the netloc.
            if not link.startswith(('http://', 'https://')):
                link = "http://" + link
            subdomain = urlparse.urlparse(link).netloc
            # Require a dot boundary so e.g. "notexample.com" is not
            # mistaken for a subdomain of "example.com".
            if (subdomain.endswith('.' + self.target.netloc)
                    and subdomain not in self.subdomains):
                self.logger.info(
                    "{engine} Found {subdomain}".format(
                        engine=self.engine_name, subdomain=subdomain))
                self.subdomains.update([subdomain])
                self.find_new_domain = True
    except Exception:
        # Best-effort scraping: a malformed page must not kill the engine.
        pass
    # Tell the engine whether there is another result page to fetch.
    return next_page in content
def extract(self, content):
    """Harvest target subdomains from one page of search results.

    Scans *content* for ``c-showurl`` anchors whose displayed URL ends
    with the target domain, records previously-unseen subdomains in
    ``self.subdomains`` and flags ``self.find_new_domain`` when this
    page yielded anything new.

    :param content: raw HTML of a search-result page.
    :return: True if a 'class="n"' pager link is present, meaning the
        engine should request another page; False otherwise.
    """
    # re.escape keeps the dots in the domain from acting as regex
    # wildcards.
    pattern = re.compile(r'<a.*?class="c-showurl".*?>(.*?{domain})'.format(
        domain=re.escape(self.target.netloc)))
    next_page = re.compile(r'<a.*?class="n">(.*?)</a>')
    try:
        links = pattern.findall(content)
        self.find_new_domain = False
        for link in links:
            # Strip markup, stray angle brackets and spaces that the
            # engine interleaves with the displayed URL.
            link = re.sub(r'<.*?>|>|<| ', '', link)
            if not link.startswith(('http://', 'https://')):
                link = "http://" + link
            subdomain = urlparse.urlparse(link).netloc
            # Require a dot boundary so e.g. "notexample.com" is not
            # mistaken for a subdomain of "example.com".
            if (subdomain.endswith('.' + self.target.netloc)
                    and subdomain not in self.subdomains):
                self.logger.info("{engine} Found {subdomain}".format(
                    engine=self.engine_name, subdomain=subdomain))
                self.subdomains.update([subdomain])
                self.find_new_domain = True
    except Exception:
        # Best-effort scraping: a malformed page must not kill the engine.
        pass
    # Tell the engine whether there is another result page to fetch.
    return bool(next_page.findall(content))