class uol():
    """Collect result URLs from the UOL search JSON endpoint, page by page.

    Typical use is ``uol().search(query)``. Collected URLs and the page
    counter live on the instance and are not reset between calls, so use a
    fresh instance for each query.
    """

    # Base request headers, built once when the class is defined.
    headers = util.getHeaders()
    # Page number used in verbose output and for requesting the next page.
    counter = 1

    def __init__(self):
        # Per-instance state: as class attributes these containers would be
        # shared by every instance, so one search's URLs (and its Referer
        # header) would leak into the next.
        # NOTE(review): assumes util.getHeaders() returns a dict-like mapping.
        self.headers = dict(self.headers)
        self.results = list()
        self.finalResults = list()

    def search(self, query, pageId=1, verbose=False):
        """Fetch page ``pageId`` for ``query`` and follow the remaining pages.

        Args:
            query: search term, interpolated into the URL as given.
            pageId: page number to request.
            verbose: when true, print a progress line for every page.

        Returns:
            The list of collected URLs, or None when nothing was collected.
        """
        self.verbose = verbose
        self.query = query
        self.headers['Referer'] = (
            'https://busca.uol.com.br/result.html?term=%s' % query)
        url = "https://busca.uol.com.br/search?client=uol&term=%s&page=%d" % (
            query, pageId)
        r = requests.get(url, headers=self.headers)
        self.searchParser(r.json())
        if self.finalResults:
            return self.finalResults
        return None

    def searchParser(self, content):
        """Store the URLs of one decoded JSON page and request the next one.

        ``content`` is read for its ``'results'`` entry, a sequence of
        objects carrying a ``'url'`` key. An empty or missing ``'results'``
        marks the end of pagination and publishes the collected URLs in
        ``finalResults``.
        """
        pageResults = content.get('results')
        if not pageResults:
            self.finalResults = self.results
            return
        if self.verbose:
            print('UOL - Searching page %d' % self.counter)
        self.results.extend(response['url'] for response in pageResults)
        self.counter = self.counter + 1
        # Pagination is recursive: search() calls back into this method.
        self.search(self.query, self.counter, verbose=self.verbose)
class ask():
    """Collect result URLs from Ask.com HTML result pages.

    Typical use is ``ask().search(query)``. Collected URLs and the page
    counter live on the instance and are not reset between calls, so use a
    fresh instance for each query.
    """

    # Request headers, built once when the class is defined (read-only here).
    headers = util.getHeaders()
    # Number of the page being parsed; also drives the first-page check below.
    counter = 1

    def __init__(self):
        # Per-instance containers: as class attributes these lists would be
        # shared by every instance, so one search's URLs would leak into the
        # next.
        self.results = list()
        self.finalResults = list()

    def search(self, query, pageId=1, verbose=False):
        """Fetch page ``pageId`` for ``query`` and follow the remaining pages.

        Args:
            query: search term, interpolated into the URL as given.
            pageId: page number to request.
            verbose: when true, print a progress line for every page.

        Returns:
            The list of collected URLs, or None when nothing was collected.
        """
        self.verbose = verbose
        url = "https://www.search.ask.com/web?q=%s&o=&tpr=10&page=%d" % (
            query, pageId)
        r = requests.get(url, headers=self.headers)
        self.searchParser(r.content)
        if self.finalResults:
            return self.finalResults
        return None

    def searchParser(self, content):
        """Extract result links from one HTML page and decide whether to go on."""
        tree = html.fromstring(content)
        urls = tree.xpath('//*[@id="algo-container"]/ol/li/div/a/@href')
        nextPage = tree.xpath(
            '//*[@id="pagination-nav-block"]/div/a[2]/@href')
        if self.verbose:
            print('Ask - Searching page %d' % self.counter)
        self.counter = self.counter + 1
        self.results.extend(urls)
        # The counter was already advanced, so ``counter - 1 == 1`` means the
        # page just parsed was the first one. Pagination continues when the
        # XPath matched two links or when this was the first page.
        if nextPage and (len(nextPage) == 2 or self.counter - 1 == 1):
            self.checkNextPage(nextPage)
        else:
            self.finalResults = self.results

    def checkNextPage(self, nextPage):
        """Request the page referenced by the last pagination link.

        The query and the page number are taken positionally from the link:
        the values of its third and fourth ``&``-separated parameters.
        """
        params = nextPage[-1].split("&")
        query = params[2].split("=")[1]
        pageId = params[3].split("=")[1]
        self.search(query, int(pageId), verbose=self.verbose)
class yahoo():
    """Collect result URLs from Yahoo (br.search.yahoo.com) HTML result pages.

    Typical use is ``yahoo().search(query)``. Collected URLs and the page
    counter live on the instance and are not reset between calls, so use a
    fresh instance for each query.
    """

    # Request headers, built once when the class is defined (read-only here).
    headers = util.getHeaders()
    # Number of the page being parsed; only used for verbose output.
    counter = 1

    def __init__(self):
        # Per-instance containers: as class attributes these lists would be
        # shared by every instance, so one search's URLs would leak into the
        # next.
        self.results = list()
        self.finalResults = list()

    def search(self, query=None, url=None, verbose=False):
        """Fetch a result page and follow its "next" links.

        Args:
            query: search term; used to build the first page URL when
                ``url`` is not given.
            url: full URL of a result page; takes precedence over ``query``.
            verbose: when true, print a progress line for every page.

        Returns:
            The list of collected URLs, or None when nothing was collected.
        """
        self.verbose = verbose
        if not url:
            url = 'https://br.search.yahoo.com/search?&p=%s' % query
        r = requests.get(url, headers=self.headers)
        self.searchParser(r.content)
        if self.finalResults:
            return self.finalResults
        return None

    def searchParser(self, content):
        """Extract result links from one HTML page and follow the next page."""
        tree = html.fromstring(content)
        urls = tree.xpath('//*[@class=" td-u"]/@href')
        nextPage = tree.xpath('//*[@class="next"]/@href')
        if self.verbose:
            print('Yahoo - Searching page %d' % self.counter)
        self.counter = self.counter + 1
        for url in urls:
            decoded = unquote(url)
            if "RU=" in decoded:
                # Redirect link: the target sits between "RU=" and "/RK=".
                self.results.append(
                    decoded.split("RU=")[1].split("/RK=")[0])
            else:
                # Not a redirect link: keep the href untouched instead of
                # failing with IndexError and aborting the whole search.
                self.results.append(url)
        if nextPage:
            self.checkNextPage(nextPage)
        else:
            self.finalResults = self.results

    def checkNextPage(self, nextPage):
        """Request the first "next" link found on the page."""
        self.search(url=nextPage[0], verbose=self.verbose)
class bing():
    """Collect result URLs from Bing HTML result pages.

    Typical use is ``bing().search(query)``. Collected URLs and the page
    counter live on the instance and are not reset between calls, so use a
    fresh instance for each query.
    """

    # Request headers, built once when the class is defined (read-only here).
    headers = util.getHeaders()
    # Number of the page being parsed; only used for verbose output.
    counter = 1

    def __init__(self):
        # Per-instance containers: as class attributes these lists would be
        # shared by every instance, so one search's URLs would leak into the
        # next.
        self.results = list()
        self.finalResults = list()

    def search(self, query, verbose=False):
        """Fetch the result page for ``query`` and follow the next pages.

        Args:
            query: text placed after ``q=`` in the request URL, as given.
                For follow-up pages it also carries the extra parameters of
                the pagination link (see ``checkNextPage``).
            verbose: when true, print a progress line for every page.

        Returns:
            The list of collected URLs, or None when nothing was collected.
        """
        self.verbose = verbose
        url = 'https://www.bing.com/search?q=%s' % query
        r = requests.get(url, headers=self.headers)
        self.searchParser(r.content)
        if self.finalResults:
            return self.finalResults
        return None

    def searchParser(self, content):
        """Extract result links from one HTML page and follow the next page."""
        tree = html.fromstring(content)
        urls = tree.xpath('//*[@id="b_results"]/li/h2/a/@href')
        nextPage = tree.xpath(
            '//*[@id="b_results"]/li[11]/nav/ul/li[6]/a/@href')
        if self.verbose:
            print('Bing - Searching page %d' % self.counter)
        self.counter = self.counter + 1
        self.results.extend(urls)
        if nextPage:
            self.checkNextPage(nextPage)
        else:
            self.finalResults = self.results

    def checkNextPage(self, nextPage):
        """Request the page referenced by the pagination link.

        Everything after the first ``=`` of the link is reused as the query,
        so any further parameters of the link travel along with it.
        """
        query = nextPage[0].split("=", 1)[1]
        self.search(query, verbose=self.verbose)
class duckDuckGo():
    """Collect result URLs from the DuckDuckGo HTML interface.

    Typical use is ``duckDuckGo().search(query)``. Collected URLs and the
    page counter live on the instance and are not reset between calls, so use
    a fresh instance for each query.
    """

    # Request headers, built once when the class is defined (read-only here).
    headers = util.getHeaders()
    # Number of the page being parsed; only used for verbose output.
    counter = 1

    def __init__(self):
        # Per-instance containers: as class attributes these lists would be
        # shared by every instance, so one search's URLs would leak into the
        # next.
        self.results = list()
        self.finalResults = list()

    def search(self, query, data=None, verbose=False):
        """Fetch a result page for ``query`` and follow the next pages.

        Args:
            query: search term, interpolated into the URL as given.
            data: form fields of a pagination form; when given the request
                is sent as POST with these fields, otherwise as GET.
            verbose: when true, print a progress line for every page.

        Returns:
            The list of collected URLs, or None when nothing was collected.
        """
        self.verbose = verbose
        self.query = query
        request = requests.post if data else requests.get
        url = "https://duckduckgo.com/html/?q=%s&s=0" % (query)
        r = request(url, headers=self.headers, data=data)
        self.searchParser(r.content)
        if self.finalResults:
            return self.finalResults
        return None

    def searchParser(self, content):
        """Extract result links from one HTML page and hand over pagination."""
        tree = html.fromstring(content)
        urls = tree.xpath('//*[@id="links"]/div/div/h2/a/@href')
        nextPage = tree.xpath('//div[@class="nav-link"]/form')
        if self.verbose:
            print('Duck Duck Go - Searching page %d' % self.counter)
        self.counter = self.counter + 1
        self.results.extend(urls)
        self.checkNextPage(nextPage)

    def checkNextPage(self, nextPage):
        """Submit the pagination form(s), or finish when there are none.

        The name/value pairs of the direct children of every matched form
        are merged into one dict (later forms override earlier ones) and
        posted back through ``search``.
        """
        if not nextPage:
            self.finalResults = self.results
            return
        dataNextPage = dict()
        for form in nextPage:
            for field in form.getchildren():
                dataNextPage[field.get('name')] = field.get('value')
        self.search(self.query, data=dataNextPage, verbose=self.verbose)
class className():
    """Template for adding a new search engine scraper.

    Copy the class, rename it, then fill in the URL, the two XPath
    expressions and ``checkNextPage``. Collected URLs and the page counter
    live on the instance and are not reset between calls.
    """

    # Request headers, built once when the class is defined (read-only here).
    headers = util.getHeaders()
    # Number of the page being parsed; only used for verbose output.
    counter = 1

    def __init__(self):
        # Per-instance containers: as class attributes these lists would be
        # shared by every instance, so one search's URLs would leak into the
        # next.
        self.results = list()
        self.finalResults = list()

    def search(self, query, verbose=False):
        """Fetch the result page for ``query`` and follow the next pages.

        Args:
            query: search term, interpolated into the URL as given.
            verbose: when true, print a progress line for every page.

        Returns:
            The list of collected URLs, or None when nothing was collected.
        """
        self.verbose = verbose
        # Remembered so checkNextPage can repeat the search; it previously
        # referenced an undefined ``query`` name and raised NameError.
        self.query = query
        url = "http://url.goes.here/search?p=%s" % query
        r = requests.get(url, headers=self.headers)
        self.searchParser(r.content)
        if self.finalResults:
            return self.finalResults
        return None

    def searchParser(self, content):
        """Extract result links from one HTML page and follow the next page."""
        tree = html.fromstring(content)
        # Placeholder: XPath of the result links. You can get it using the
        # Google Chrome developer tools.
        urls = tree.xpath('')
        # Placeholder: XPath of the next-page link or form. You can get it
        # using the Google Chrome developer tools.
        nextPage = tree.xpath('')
        if self.verbose:
            print('Search Engine Name - Searching page %d' % self.counter)
        self.counter = self.counter + 1
        self.results.extend(urls)
        if nextPage:  # or any logic you need
            # Pass the new link or data so search can be called again.
            self.checkNextPage(nextPage)
        else:
            # No more pages: publish the collected URLs.
            self.finalResults = self.results

    def checkNextPage(self, nextPage):
        """Request the next page; adapt this to the engine's pagination."""
        # Your code goes here.
        self.search(self.query, verbose=self.verbose)