import urllib.parse

import requests
import lxml.html
from bs4 import UnicodeDammit

import GoogleScraper


def get_proxies(n=5):
    """Read some notoriously known sites and extract some public proxies.

    Scrapes - http://www.samair.ru/proxy/

    The quality of these proxies is probably not worth mentioning, but it's
    nice to test the lack of quality and the behaviour of GoogleScraper.
    """
    r = requests.get('http://www.samair.ru/proxy/')

    # Try to parse the HTML result with lxml, letting UnicodeDammit guess the encoding
    try:
        doc = UnicodeDammit(r.text, is_html=True)
        parser = lxml.html.HTMLParser(encoding=doc.declared_html_encoding)
        dom = lxml.html.document_fromstring(r.text, parser=parser)
        dom.resolve_base_href()
    except Exception as e:
        print('Some error occurred while lxml tried to parse: {}'.format(e))
        return []

    proxies = []
    table = dom.xpath('//table[@id="proxylist"]')[0]
    for row in table.findall('tr'):
        # The XPath must be relative ('./td[1]'); an absolute '//td[1]' would
        # select the first cell of the whole document on every iteration.
        cells = row.xpath('./td[1]')
        if not cells:
            continue  # skip header rows that have no <td> cells
        # Assumption: the first cell holds the proxy as 'host:port' and the
        # proxies are plain HTTP proxies.
        host, _, port = cells[0].text_content().strip().partition(':')
        if port.isdigit():
            proxies.append(GoogleScraper.Proxy(proto='http', host=host,
                                               port=int(port), username='',
                                               password=''))
        if len(proxies) >= n:
            break
    return proxies
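# A minimal usage sketch for get_proxies() (assumption: GoogleScraper.scrape()
# accepts a single Proxy through the 'proxy' keyword argument that is shown
# commented out in __main__ below):
#
#   proxies = get_proxies(n=3)
#   if proxies:
#       results = GoogleScraper.scrape('test query', scrapemethod='sel',
#                                      proxy=proxies[0])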
if __name__ == '__main__':
    # See the config.cfg file for possible values
    GoogleScraper.Config.update({
        'SCRAPING': {
            'use_own_ip': 'False',
            'keyword': 'HelloWorld'
        },
        'SELENIUM': {
            'sel_browser': 'chrome',  # change to 'phantomjs' if you like
            'manual_captcha_solving': 'True'
        }
    })

    # sample proxy
    proxy = GoogleScraper.Proxy(proto='socks5', host='localhost', port=9050,
                                username='', password='')

    try:
        results = GoogleScraper.scrape('Best SEO tool', scrapemethod='sel')  # , proxy=proxy
        for page in results:
            for link_title, link_snippet, link_url, *rest in page['results']:
                # You can access all parts of the search results like this:
                # link_url.scheme => URL scheme specifier (Ex: 'http')
                # link_url.netloc => Network location part (Ex: 'www.python.org')
                # link_url.path   => Hierarchical path (Ex: '/help/Python.html')
                # link_url.params => Parameters for last path element
                # link_url.query  => Query component
                try:
                    # Reassemble the URL and decode percent-encoded characters
                    print(urllib.parse.unquote(link_url.geturl()))
                except Exception:
                    pass  # skip results whose URL can't be reassembled
    except GoogleScraper.GoogleSearchError as e:
        print(e)
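# For reference, the link_url fields commented on above are the standard
# components of a urllib.parse.urlparse() result:
#
#   >>> import urllib.parse
#   >>> urllib.parse.urlparse('http://www.python.org/help/Python.html?lang=en')
#   ParseResult(scheme='http', netloc='www.python.org', path='/help/Python.html',
#               params='', query='lang=en', fragment='')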