Example #1
import requests
import lxml.html
from bs4 import UnicodeDammit

import GoogleScraper


def get_proxies(n=5):
    """Read some well-known proxy listing sites and extract some public proxies.

    Scrapes
        - http://www.samair.ru/proxy/

    The quality of these proxies is probably not worth mentioning, but they
    are handy for testing how GoogleScraper behaves with unreliable proxies.
    """
    r = requests.get('http://www.samair.ru/proxy/')
    # Try to parse the HTML result using lxml
    try:
        doc = UnicodeDammit(r.text, is_html=True)
        parser = lxml.html.HTMLParser(encoding=doc.declared_html_encoding)
        dom = lxml.html.document_fromstring(r.text, parser=parser)
        dom.resolve_base_href()
    except Exception as e:
        print('Some error occurred while lxml tried to parse: {}'.format(e))
        return None  # without a parsed DOM there is nothing to extract

    table = dom.xpath('//table[@id="proxylist"]')[0]
    for row in table.findall('tr'):
        # Use a relative XPath here; the absolute '//td[1]' would match the
        # first <td> of the whole document on every iteration, not this row's cell.
        print(row.xpath('.//td[1]')[0].text_content())

    # The scraped addresses are only printed above; an empty default Proxy is
    # returned as a placeholder, and the parameter n is currently unused.
    return GoogleScraper.Proxy()
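
A minimal usage sketch for the function above (assuming http://www.samair.ru/proxy/ is reachable and still serves the expected table; note that get_proxies() currently only prints the addresses and returns a placeholder Proxy):

if __name__ == '__main__':
    # Prints the first cell of every row in the proxy table, then the
    # placeholder Proxy object that get_proxies() returns.
    proxy = get_proxies()
    print(proxy)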
Example #2
import urllib.parse

import GoogleScraper

if __name__ == '__main__':
    # See the config.cfg file for possible values
    GoogleScraper.Config.update({
        'SCRAPING': {
            'use_own_ip': 'False',
            'keyword': 'HelloWorld'
        },
        'SELENIUM': {
            'sel_browser': 'chrome', # change to 'phantomjs' if you prefer
            'manual_captcha_solving': 'True'
        }
    })

    # sample proxy
    proxy = GoogleScraper.Proxy(proto='socks5', host='localhost', port=9050, username='', password='')

    try:
        results = GoogleScraper.scrape('Best SEO tool', scrapemethod='sel')  # add proxy=proxy to route traffic through the proxy defined above
        for page in results:
            for link_title, link_snippet, link_url, *rest in page['results']:
                # You can access all parts of the search results like that
                # link_url.scheme => URL scheme specifier (Ex: 'http')
                # link_url.netloc => Network location part (Ex: 'www.python.org')
                # link_url.path => Hierarchical path (Ex: '/help/Python.html')
                # link_url.params => Parameters for last path element
                # link_url.query => Query component
                try:
                    # Reassemble the parts of the URL into the whole thing
                    print(urllib.parse.unquote(link_url.geturl()))
                except Exception:
                    # ignore URLs that cannot be reassembled or unquoted
                    pass
    except Exception as e:
        # scrape() may fail, e.g. when the browser or network is unavailable
        print('Scraping failed: {}'.format(e))
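
For comparison, a sketch of the same query without a browser. This assumes the installed GoogleScraper version also accepts scrapemethod='http' (plain HTTP requests instead of selenium); the result structure is assumed to match the selenium example above:

import urllib.parse

import GoogleScraper

# Assumption: scrapemethod='http' is supported by this GoogleScraper version.
results = GoogleScraper.scrape('Best SEO tool', scrapemethod='http')
for page in results:
    for link_title, link_snippet, link_url, *rest in page['results']:
        # link_url is a urllib.parse result; geturl() reassembles the parts
        print(urllib.parse.unquote(link_url.geturl()))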