def fetch_ips(q: Queue, validator_queue: Queue):
    logger.debug('fetch_ips...')
    worker = Worker()

    while True:
        try:
            # This variant expects provider *classes* on the queue;
            # q.get()() instantiates one.
            provider: BaseProvider = q.get()()
            provider_name = provider.__class__.__name__
            logger.debug('Get a provider from the provider queue: ' + provider_name)

            for url in provider.urls():
                try:
                    html = worker.get_html(url, render_js=provider.should_render_js())
                except Exception as e:
                    logger.error('worker.get_html failed: %s', e)
                    continue

                if html:
                    proxies = provider.parse(html)
                    for p in proxies:
                        validator_queue.put(p)
                        # logger.debug('Put new proxy ip into queue: {}'.format(p.__str__()))
                    logger.info(' {}: feed {} potential proxies into the validator queue'
                                .format(provider_name, len(proxies)))
        except (KeyboardInterrupt, InterruptedError, SystemExit):
            worker.stop()
            logger.info('worker_process exited.')
            break
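A minimal sketch of how this variant could be driven, assuming it runs in its own process as the log messages suggest. start_fetcher and the daemon flag are illustrative choices here, not the project's actual entry point:

from multiprocessing import Process, Queue

def start_fetcher(provider_classes):
    # Queue of provider classes, matching the q.get()() call above.
    provider_queue: Queue = Queue()
    validator_queue: Queue = Queue()
    for cls in provider_classes:
        provider_queue.put(cls)
    fetcher = Process(target=fetch_ips, args=(provider_queue, validator_queue))
    fetcher.daemon = True  # die with the parent process
    fetcher.start()
    return fetcher, validator_queue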
import re
from typing import List


class ProxylistsProvider(BaseProvider):

    def __init__(self):
        self.w = Worker()
        self.country_pattern = re.compile(r'^/(.+)_0\.html$')

    def parse(self, html: HTML) -> List[ProxyIP]:
        ip_list: List[ProxyIP] = []
        for tr in html.find('table table tr'):
            ip_element = tr.find('td:nth-of-type(1)', first=True)
            port_element = tr.find('td:nth-of-type(2)', first=True)
            if ip_element and port_element:
                ip = re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ip_element.text).group(0)
                port = re.search(r'\d{2,5}', port_element.text).group(0)
                proxy = ProxyIP(ip=ip, port=port, provider=self.__class__.__name__)
                ip_list.append(proxy)
        return ip_list

    def urls(self) -> List[str]:
        ret = set()
        country_url = 'http://www.proxylists.net/countries.html'
        country_page = self.w.get_html(country_url, False)
        for a in country_page.find('a'):
            relative_path = a.attrs['href']
            if self.country_pattern.match(relative_path):
                ret.update(self.gen_url_for_country(self.country_pattern.findall(relative_path)[0]))
                break  # stop after the first matching country
        return list(ret)

    def gen_url_for_country(self, country) -> List[str]:
        ret = []
        first_page = self.w.get_html('http://www.proxylists.net/{}_0.html'.format(country), False)
        for a in first_page.find('table table tr:last-of-type a'):
            ret.append('http://www.proxylists.net/{}'.format(a.attrs['href']))
        return ret

    @staticmethod
    def should_render_js() -> bool:
        return True
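The country links are recognized purely by their /<country>_0.html shape. A quick standalone check of that pattern (the example paths are hypothetical):

import re

country_pattern = re.compile(r'^/(.+)_0\.html$')
assert country_pattern.findall('/France_0.html') == ['France']  # country page matches
assert country_pattern.match('/about.html') is None             # other links are ignored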
import base64
import re
import urllib.parse
from typing import List

from pyquery import PyQuery


class ProxyListProvider(BaseProvider):

    def __init__(self):
        super().__init__()
        self.w = Worker()

    def parse(self, document: PyQuery) -> List[ProxyIP]:
        ip_list: List[ProxyIP] = []

        if document is None:
            return []

        # .items() yields PyQuery objects, so CSS selectors and .text()
        # are available on each <ul>.
        for ul in document.find('#proxy-table > div.table-wrap ul').items():
            js_code_element = ul.find('li.proxy script')
            if not js_code_element:
                return []
            js_code = js_code_element.text()
            matched = re.findall(r"Proxy\('(.+)'\)", js_code)
            if matched:
                # The ip:port pair is base64-encoded inside a Proxy('...') call.
                encoded = matched[0]
                ip_port = base64.b64decode(encoded).decode('utf-8')
                ip = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ip_port)[0]
                port = re.findall(r':(\d{2,5})', ip_port)[0]
                ip_list.append(ProxyIP(ip=ip, port=port))
        return ip_list

    def urls(self) -> List[str]:
        ret = []
        first_url = 'http://proxy-list.org/english/index.php?p=1'
        first_page = self.w.get_html(first_url, False)
        if first_page:
            ret.append(first_url)
            for a in first_page.find('#content div.content div.table-menu a.item'):
                relative_path = a.attrib['href']
                absolute_url = urllib.parse.urljoin(first_url, relative_path)
                ret.append(absolute_url)
        return ret

    @staticmethod
    def should_render_js() -> bool:
        return False
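The page obfuscates each address as a base64 string passed to a Proxy('...') call in an inline script. The decode step round-trips like this (synthetic payload for illustration):

import base64
import re

js_code = "Proxy('{}')".format(base64.b64encode(b'127.0.0.1:8080').decode())
encoded = re.findall(r"Proxy\('(.+)'\)", js_code)[0]
assert base64.b64decode(encoded).decode('utf-8') == '127.0.0.1:8080'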
import pyppeteer.errors


def fetch_ips(q: Queue, validator_queue: Queue, run_once=False):
    logger.debug('worker_process started.')
    logger.info('fetching ips...')
    worker = Worker()

    while True:
        try:
            if run_once and q.empty():
                # SystemExit is caught below, so the worker shuts down cleanly.
                raise SystemExit
            provider: BaseProvider = q.get()
            provider_name = provider.__class__.__name__
            logger.info('Get a provider from the provider queue: ' + provider_name)

            for url in provider.urls():
                html = worker.get_html(url, render_js=provider.should_render_js())
                if html:
                    proxies = provider.parse(html)
                    for p in proxies:
                        validator_queue.put(p)
                        # logger.debug('Put new proxy ip into queue: {}'.format(p.__str__()))
                    logger.info(' {}: feed {} potential proxies into the validator queue'
                                .format(provider_name, len(proxies)))
        except (KeyboardInterrupt, InterruptedError, SystemExit):
            worker.stop()
            break
        except pyppeteer.errors.PyppeteerError as e:
            logger.error(
                'pyppeteer.errors.PyppeteerError detected: %s\n'
                'Please make sure you have installed all the dependencies for chromium correctly', e)
            break

    logger.debug('worker_process exited.')
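With run_once=True the loop drains whatever is already queued and exits through the SystemExit path, which makes single-pass runs (e.g. in tests) possible. A hedged sketch, assuming providers are queued as instances as in this variant; a plain queue.Queue works here because everything stays in one process:

import queue

provider_queue = queue.Queue()
validator_queue = queue.Queue()
provider_queue.put(ProxyListProvider())  # this variant queues instances
fetch_ips(provider_queue, validator_queue, run_once=True)  # drains the queue, then exits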
import base64
import re
from typing import List


class ProxyListProvider(BaseProvider):

    def __init__(self):
        self.w = Worker()

    def parse(self, html: HTML) -> List[ProxyIP]:
        ip_list: List[ProxyIP] = []

        if html is None:
            return []

        for ul in html.find('#proxy-table > div.table-wrap ul'):
            js_code = ul.find('li.proxy script', first=True).text
            matched = re.findall(r"Proxy\('(.+)'\)", js_code)
            if matched:
                encoded = matched[0]
                ip_port = base64.b64decode(encoded).decode('utf-8')
                ip = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ip_port)[0]
                port = re.findall(r':(\d{2,5})', ip_port)[0]
                proxy = ProxyIP(ip=ip, port=port, provider=self.__class__.__name__)
                ip_list.append(proxy)
        return ip_list

    def urls(self) -> List[str]:
        ret = []
        first_url = 'http://proxy-list.org/english/index.php?p=1'
        sub = first_url[0:first_url.rfind('/')]  # http://proxy-list.org/english
        first_page = self.w.get_html(first_url, False)
        if first_page is None:  # guard against a failed fetch
            return ret
        ret.append(first_url)
        for a in first_page.find('#content div.content div.table-menu a.item'):
            relative_path = a.attrs['href']
            absolute_url = sub + relative_path[relative_path.find('/'):]
            ret.append(absolute_url)
        return ret

    @staticmethod
    def should_render_js() -> bool:
        return False
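The manual splicing in urls() re-implements what urllib.parse.urljoin does, which is what the PyQuery revision above switches to. A quick equivalence check with a hypothetical pagination href:

import urllib.parse

first_url = 'http://proxy-list.org/english/index.php?p=1'
sub = first_url[:first_url.rfind('/')]
relative_path = './index.php?p=2'  # hypothetical href from the table menu
manual = sub + relative_path[relative_path.find('/'):]
assert manual == urllib.parse.urljoin(first_url, relative_path)
assert manual == 'http://proxy-list.org/english/index.php?p=2'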
def fetch_ips(q: Queue, validator_queue: Queue):
    logger.debug('fetch_ips...')
    worker = Worker()

    while True:
        try:
            provider: BaseProvider = q.get()
            provider_name = provider.__class__.__name__
            logger.debug('Get a provider from the provider queue: ' + provider_name)

            for url in provider.urls():
                html = worker.get_html(url, render_js=provider.should_render_js())
                if html:
                    proxies = provider.parse(html)
                    for p in proxies:
                        validator_queue.put(p)
                        logger.debug('Put new proxy ip into queue: {}'.format(p))
                    logger.info(' {}: feed {} potential proxies into the validator queue'
                                .format(provider_name, len(proxies)))
        except (KeyboardInterrupt, InterruptedError, SystemExit):
            logger.info('worker_process exited.')
            break
        except pyppeteer.errors.PyppeteerError as e:
            logger.debug(
                'pyppeteer.errors.PyppeteerError detected: {}\n'.format(e) +
                'Please make sure you have installed all the dependencies for chromium correctly')
        except Exception as e:
            worker = Worker()  # reset the worker after an unhandled error
            logger.warning('Unhandled exception is detected: {}'.format(e))
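For completeness, a hedged sketch of the consuming side of validator_queue; validate_proxy is a hypothetical stand-in for the project's real validator, not an actual API:

def validate_loop(validator_queue: Queue):
    while True:
        proxy = validator_queue.get()  # blocks until fetch_ips feeds a candidate
        validate_proxy(proxy)          # hypothetical validator entry point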