def get_html_tree(url, headers=None, cookie=None, proxy=None, data=None, verify=False):
    """Fetch ``url`` and return the page parsed with ``etree.HTML``.

    A POST is sent when ``data`` is not None, otherwise a GET. Both use a
    10 second timeout, and ``HEADERS`` is used when ``headers`` is None.

    Args:
        url: Address to request.
        headers: Request headers; defaults to the module-level ``HEADERS``.
        cookie: Passed to requests as ``cookies``.
        proxy: Passed to requests as ``proxies``.
        data: Body for a POST request; None selects GET.
        verify: Passed to requests as ``verify``.

    Returns:
        The result of ``etree.HTML`` on the response text.

    Raises:
        Exception: Any error from the request, the status check or the
            parsing is logged and re-raised unchanged.
    """
    request_kwargs = {
        "url": url,
        "headers": HEADERS if headers is None else headers,
        "cookies": cookie,
        "timeout": 10,
        "proxies": proxy,
        "verify": verify,
    }
    try:
        if data is None:
            response = requests.get(**request_kwargs)
        else:
            response = requests.post(data=data, **request_kwargs)
        response.raise_for_status()
        # Decode with the encoding detected from the body, not the header.
        response.encoding = response.apparent_encoding
        page = response.text
        if isinstance(page, bytes):
            page = page.decode("utf-8")
        # One second pause after every successful fetch.
        time.sleep(1)
        return etree.HTML(page)
    except Exception as err:
        log.error("{0}".format(err))
        raise
def get_html(url, headers=None, cookie=None, proxy=None, data=None, verify=False):
    """Fetch ``url`` and return the response body as text.

    A POST is sent when ``data`` is not None, otherwise a GET. Both use a
    10 second timeout, and ``HEADERS`` is used when ``headers`` is None.
    The HTTP status code is not checked, so error pages are returned like
    any other body.

    Args:
        url: Address to request.
        headers: Request headers; defaults to the module-level ``HEADERS``.
        cookie: Passed to requests as ``cookies``.
        proxy: Passed to requests as ``proxies``.
        data: Body for a POST request; None selects GET.
        verify: Passed to requests as ``verify``.

    Returns:
        The response text, decoded with the response's apparent encoding.

    Raises:
        Exception: Any failure is logged and re-raised wrapped in a plain
            ``Exception``.
    """
    request_kwargs = {
        "url": url,
        "headers": HEADERS if headers is None else headers,
        "cookies": cookie,
        "timeout": 10,
        "proxies": proxy,
        "verify": verify,
    }
    try:
        if data is None:
            response = requests.get(**request_kwargs)
        else:
            response = requests.post(data=data, **request_kwargs)
        # No raise_for_status() here: non-2xx responses are passed through.
        response.encoding = response.apparent_encoding
        return response.text
    except Exception as err:
        log.error("{0}".format(err))
        raise Exception(err)
def __init__(self, database=None, url_prefix=None, fetcher=None, checker=None):
    """Wire up the Redis connection, key prefixes, fetcher and checker.

    Args:
        database: Object exposing ``host``, ``port``, ``db`` and
            ``password``; when falsy, Redis at 127.0.0.1:6379 db 0 is used.
        url_prefix: Target prefix; ``"default"`` when falsy.
        fetcher: Existing fetcher to reuse. When given, its providers are
            backed up via ``backup_provider()``; otherwise a new
            ``Fetcher`` is created.
        checker: Existing checker to reuse; a new ``Checker`` otherwise.
    """
    if database:
        self.database = RedisWrapper(database.host, database.port,
                                     database.db, database.password)
    else:
        self.database = RedisWrapper("127.0.0.1", 6379, 0)

    # Prefixes for the proxy collections managed by this object.
    self._origin_prefix = 'origin_proxy'
    self._useful_prefix = 'useful_proxy'
    self._hundred_prefix = 'hundred_proxy'
    self._current_prefix = 'current_proxy'

    self._url_prefix = url_prefix if url_prefix else "default"

    if fetcher:
        # Refresher: reuse the caller's fetcher and back up its providers.
        self._fetcher = fetcher
        self._fetcher.backup_provider()
        log.error("REFRESH FETCHER BACKUP PROVIDER {0}".format(
            str(self._fetcher)))
    else:
        # Validater: no fetcher supplied, create a fresh one.
        self._fetcher = Fetcher()

    self._checker = checker if checker else Checker()

    self.log = log
def validate(target_url, proxy, checker):
    """Return True when ``target_url`` can be fetched through ``proxy``.

    The proxy passes only if the GET returns status 200 and
    ``checker.checker_func`` accepts the response body.

    Args:
        target_url: URL to request; ``"default"`` means
            ``https://www.baidu.com``.
        proxy: ``host:port`` string of the proxy under test.
        checker: Object providing ``timeout`` and ``checker_func(content)``.

    Returns:
        True on success; False on a non-200 status, a rejected body or any
        exception (which is logged).
    """
    if target_url == "default":
        target_url = "https://www.baidu.com"

    # Both schemes are tunnelled through the same plain-HTTP proxy URL.
    proxy_url = "http://{proxy}".format(proxy=proxy)
    proxies = {"http": proxy_url, "https": proxy_url}

    try:
        resp = requests.get(target_url,
                            proxies=proxies,
                            timeout=checker.timeout,
                            verify=False,
                            headers=HEADERS_IPHONE)
        if resp.status_code != 200:
            return False
        if not checker.checker_func(resp.content):
            return False
        log.info('validate success target {0} proxy {1}'.format(
            target_url, proxy))
        return True
    except Exception as err:
        log.error("validate failed with {0}".format(err))
        return False
def validate(target_url, proxy):
    """Return True when ``target_url`` answers 200 through ``proxy``.

    Args:
        target_url: URL to request; ``"default"`` means
            ``https://www.baidu.com``.
        proxy: ``host:port`` string of the proxy under test.

    Returns:
        True when the GET (5 second timeout) returns status 200; False on
        any other status or on any exception (which is logged).
    """
    if target_url == "default":
        target_url = "https://www.baidu.com"

    # The default target is itself https, so it takes the https branch.
    if urlparse(target_url).scheme == "https":
        proxies = {"https": "https://{proxy}".format(proxy=proxy)}
    else:
        plain = "http://{proxy}".format(proxy=proxy)
        proxies = {"http": plain, "https": plain}

    try:
        resp = requests.get(target_url,
                            proxies=proxies,
                            timeout=5,
                            verify=False,
                            headers=HEADERS_IPHONE)
        if resp.status_code != 200:
            return False
        log.info('validate success target {0} proxy{1}'.format(
            target_url, proxy))
        return True
    except Exception as err:
        log.error("{0}".format(err))
        return False
def _do_data_forward(self, sock_in, sock_out):
    """Copy bytes from ``sock_in`` to ``sock_out`` until the stream ends.

    Reads chunks of up to ``ForwardServer.PAGE_SIZE`` bytes and writes each
    one to ``sock_out``. The loop stops on a read error, a write error, or
    when ``sock_in`` reports end of stream (empty read).

    Both sockets are closed on every exit path, including when looking up
    a peer address fails before the loop starts; that failure still
    propagates to the caller.

    Args:
        sock_in: Connected socket to read from.
        sock_out: Connected socket to write to.
    """
    try:
        addr_in = '%s:%d' % sock_in.getpeername()
        addr_out = '%s:%d' % sock_out.getpeername()
        while True:
            try:
                data = sock_in.recv(ForwardServer.PAGE_SIZE)
            except Exception as e:
                log.error('Socket read error of %s: %s' % (addr_in, str(e)))
                break

            if not data:
                # An empty read means the peer closed its side.
                log.info('Socket closed by ' + addr_in)
                break

            try:
                sock_out.sendall(data)
            except Exception as e:
                log.error('Socket write error of %s: %s' % (addr_out, str(e)))
                break

            log.info('%s -> %s (%d B)' % (addr_in, addr_out, len(data)))
    finally:
        # Close both ends even if closing the first one fails.
        try:
            sock_in.close()
        finally:
            sock_out.close()
def _forward(self, sock_in):
    """Connect to the default remote and start forwarding in both directions.

    Opens an outbound socket to ``default_remote_host:default_remote_port``
    through ``ForwardClient`` and starts two threads running
    ``_do_data_forward``, one per direction.

    Args:
        sock_in: Accepted client socket.

    Raises:
        Exception: Whatever obtaining the outbound socket raised. The error
            is logged and ``sock_in`` is closed before re-raising, so the
            client connection is not left open.
    """
    try:
        print("Remote host and remote port", self.default_remote_host,
              self.default_remote_port)
        sock_out = ForwardClient(self.default_remote_host,
                                 self.default_remote_port).get_client()
        log.info('get the client socks done')
    except Exception as e:
        log.error('Get Remote Client error: %s' % str(e))
        # Nothing will ever service this client, so release its socket now.
        sock_in.close()
        raise

    threading.Thread(target=self._do_data_forward,
                     args=(sock_in, sock_out)).start()
    threading.Thread(target=self._do_data_forward,
                     args=(sock_out, sock_in)).start()
def parse(self, url):
    """Load ``url`` in the web driver and return the parsed page.

    Args:
        url: Address to open with ``self.driver``.

    Returns:
        The result of ``etree.HTML`` on the driver's page source.

    Raises:
        Exception: Any failure is logged and re-raised wrapped in a plain
            ``Exception``.

    The driver is released and ``self.wd`` stopped whether or not the load
    succeeded.
    """
    try:
        try:
            self.driver.get(url)
            page_source = self.driver.page_source
            tree = etree.HTML(page_source)
        except Exception as err:
            log.error("{0}".format(err))
            raise Exception(err)
        return tree
    finally:
        self.wd.release(self.driver)
        self.wd.stop()
def get_html(url, headers=None):
    """GET ``url`` and return the body as text, or None on failure.

    Args:
        url: Address to request.
        headers: Request headers; defaults to the module-level ``HEADERS``.

    Returns:
        The response text decoded with the response's apparent encoding, or
        None when the request fails or returns an error status (the error
        is logged).
    """
    request_headers = HEADERS if headers is None else headers
    try:
        resp = requests.get(url, headers=request_headers, timeout=10)
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding
        return resp.text
    except Exception as err:
        log.error("{0}".format(err))
        return None
def get_client(self):
    """Create a socket connected to ``remote_host:remote_port``.

    Returns:
        The connected ``socks.socksocket``.

    Raises:
        Exception: When the connection fails with ``socket.error``; the
            socket is closed and the error logged first.
    """
    client = socks.socksocket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        print('remote,=', (self.remote_host, self.remote_port))
        client.connect((self.remote_host, self.remote_port))
    except socket.error as err:
        client.close()
        message = 'Remote connect error: %s' % str(err)
        log.error(message)
        raise Exception(message)
    return client
def _forward(self, sock_in):
    """Create a ``ForwardClient`` and start forwarding in both directions.

    Two threads running ``_do_data_forward`` are started: one from
    ``sock_in`` to the client object and one in the reverse direction.

    Args:
        sock_in: Accepted client socket.

    Raises:
        Exception: Whatever creating the ``ForwardClient`` raised, after
            logging it.
    """
    try:
        sock_out = ForwardClient()
        log.info('get the client socks done')
    except Exception as err:
        log.error('Get Remote Client error: %s' % str(err))
        raise err

    for source, sink in ((sock_in, sock_out), (sock_out, sock_in)):
        worker = threading.Thread(target=self._do_data_forward,
                                  args=(source, sink))
        worker.start()
def get_image_result(self, image_url):
    """Download a captcha image and return the crack service's answer.

    The image is fetched from ``image_url``, base64-encoded and posted as
    the ``image`` form field to ``self.crack_url``. The service's response
    body, decoded as UTF-8, is returned.

    Both HTTP calls use a 10 second timeout, so an unresponsive crack
    service cannot block the caller indefinitely.

    Args:
        image_url: Address of the captcha image.

    Returns:
        The crack service's response body as a string.

    Raises:
        Exception: When fetching the image fails or returns a status other
            than 200. Errors from the crack-service call propagate
            unchanged.
    """
    try:
        ir = requests.get(image_url, headers=HEADERS, timeout=10)
    except Exception as e:
        log.error("Error fetching captcha {0}".format(e))
        raise Exception(e)

    if ir.status_code != 200:
        log.error("Error cracking captcha {0}".format(ir.status_code))
        raise Exception("Error cracking captcha {0}".format(
            ir.status_code))

    post_data = {"image": base64.b64encode(ir.content)}
    # Without a timeout requests waits forever on a stalled server.
    res = requests.post(self.crack_url, data=post_data, timeout=10)
    answer = str(res.content, encoding="utf-8")
    return answer
def get_html_tree(url, headers=None):
    """GET ``url`` and return the page parsed with ``etree.HTML``.

    The body is decoded with the encoding detected from its content
    (``response.apparent_encoding``), so pages that are not UTF-8 are
    decoded correctly instead of failing.

    Args:
        url: Address to request.
        headers: Request headers; defaults to the module-level ``HEADERS``.

    Returns:
        The result of ``etree.HTML`` on the decoded body, or None when the
        request, the status check or the parsing fails (the error is
        logged).
    """
    if headers is None:
        headers = HEADERS
    try:
        response = requests.get(url=url, headers=headers, timeout=30)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        # .text honours the encoding set above; decoding .content as UTF-8
        # by hand would ignore it.
        html = response.text
        # One second pause after every successful fetch.
        time.sleep(1)
        return etree.HTML(html)
    except Exception as e:
        log.error("{0}".format(e))
        return None
def refresh(self):
    """Collect proxies from every provider and store them as origin proxies.

    Does nothing when ``refresh_condition()`` is false. When fewer than six
    providers remain, the fetcher's providers are restored first. Each
    provider's ``getter()`` is then drained; non-blank proxies are stripped
    and de-duplicated. A provider whose getter raises is logged and its
    index is passed to ``remove_provider`` at the end.

    Every collected proxy is timestamped under ``spoon:proxy_stale`` and
    put into the origin collection.
    """
    log.info("REFRESH START WITH {0} TARGET {1}".format(
        str(self._fetcher), self.get_netloc()))

    if not self.refresh_condition():
        log.info("REFRESH DID NOT MEET CONDITION. TARGET{0}".format(
            self.get_netloc()))
        return

    if len(self._fetcher) < 6:
        self._fetcher.restore_provider()
        log.info(
            "REFRESH FETCHER FAILED: NO ENOUGH PROVIDER, RESTORE PROVIDERS TO {0} for TARGET {1}"
            .format(str(self._fetcher), self.get_netloc()))

    collected = set()
    failed_indexes = []
    for position in range(len(self._fetcher)):
        provider = self._fetcher.get_provider(position)
        try:
            for raw_proxy in provider.getter():
                cleaned = raw_proxy.strip()
                if not cleaned:
                    continue
                self.log.info(
                    "REFRESH FETCHER: TARGET {0} PROVIDER {1} PROXY {2}"
                    .format(self.get_netloc(),
                            provider.__class__.__name__, cleaned))
                collected.add(cleaned)
        except Exception as err:
            # Keep what was gathered so far; the provider is dropped later.
            failed_indexes.append(position)
            log.error(
                "REFRESH FETCHER FAILED: PROVIDER {0} WILL BE REMOVED ERROR {1}"
                .format(provider.__class__.__name__, err))

    origin_name_for = self.generate_name
    for entry in collected:
        self.database.set_value("spoon:proxy_stale", entry, time.time())
        self.database.put(origin_name_for(self._origin_prefix), entry)

    log.info("REFRESH FETCHER DELETE {0}. TARGET {1}".format(
        failed_indexes, self.get_netloc()))
    self._fetcher.remove_provider(failed_indexes)
def serve(self):
    """Accept clients until told to exit, forwarding each in its own thread.

    The listening socket comes from ``self._listen()``. Each accepted
    connection is handed to ``self._forward`` on a new thread. The loop
    ends when the module-level ``is_exit`` flag becomes true, on
    ``KeyboardInterrupt``/``SystemExit``, or on any error from ``accept``.

    The listening socket is shut down and closed on every exit path,
    including the ``is_exit`` path.
    """
    sock_server = self._listen()
    try:
        while not is_exit:
            try:
                sock, addr = sock_server.accept()
            except (KeyboardInterrupt, SystemExit):
                log.warn('Closing...')
                break
            except Exception as e:
                log.error('Exception exit {0}'.format(e))
                break

            threading.Thread(target=self._forward, args=(sock,)).start()
            log.info('New clients from {0}'.format(addr))
    finally:
        try:
            sock_server.shutdown(socket.SHUT_RDWR)
        except OSError:
            # shutdown() on a listening socket fails on some platforms;
            # that must not prevent close() below.
            pass
        sock_server.close()

    log.info('exit server')
def _do_data_forward(self, sock_in, sock_out):
    """Copy bytes from ``sock_in`` to ``sock_out`` until the stream ends.

    Either argument may be a ``ForwardClient`` placeholder instead of a
    connected socket:

    * a placeholder ``sock_in`` is connected to the default remote at once;
    * a placeholder ``sock_out`` is connected when the first chunk of data
      arrives, so the upstream can be chosen from that chunk's ``Host``
      header (see ``_open_upstream``).

    The loop stops on a read error (which includes a failure to open the
    upstream), a write error, or an empty read. ``sock_in`` is always
    closed; ``sock_out`` is closed if it was ever connected.

    Args:
        sock_in: Socket, or ``ForwardClient``, to read from.
        sock_out: Socket, or ``ForwardClient``, to write to.
    """
    if isinstance(sock_in, ForwardClient):
        sock_in = sock_in.get_client(self.default_remote_host,
                                     self.default_remote_port)
    try:
        addr_in = '%s:%d' % sock_in.getpeername()
        while True:
            try:
                data = sock_in.recv(ForwardServer.PAGE_SIZE)
                if data and isinstance(sock_out, ForwardClient):
                    print("sock_in", data)
                    # Resolve the placeholder exactly once; afterwards
                    # sock_out is a plain socket and this branch is skipped.
                    sock_out = self._open_upstream(sock_out, data)
            except Exception as e:
                log.error('Socket read error of %s: %s' % (addr_in, str(e)))
                break

            if not data:
                # An empty read means the peer closed its side.
                log.info('Socket closed by ' + addr_in)
                break

            addr_out = '%s:%d' % sock_out.getpeername()
            try:
                sock_out.sendall(data)
            except Exception as e:
                log.error('Socket write error of %s: %s' % (addr_out, str(e)))
                break

            log.info('%s -> %s (%d B)' % (addr_in, addr_out, len(data)))
    finally:
        try:
            sock_in.close()
        finally:
            # A placeholder that was never connected has nothing to close.
            if not isinstance(sock_out, ForwardClient):
                sock_out.close()

def _open_upstream(self, client, data):
    """Connect ``client`` to the upstream chosen for this first chunk.

    When ``data`` carries a ``Host`` header and
    ``spoon:<host>:current_proxy`` holds entries, one entry is picked at
    random from indexes ``0 .. len // 3`` and used as ``host:port``.
    Otherwise the default remote is used.

    Args:
        client: ``ForwardClient`` placeholder to connect.
        data: First bytes received from the inbound socket.

    Returns:
        The socket returned by ``client.get_client``.
    """
    if b'Host' in data:
        # Match on bytes so a non-UTF-8 payload cannot raise while decoding.
        host_match = re.match(rb'.*Host:\s(.*?)\r\n.*', data, re.S)
        if host_match:
            hostname = host_match.group(1).decode("utf-8", "replace")
            current_proxy_list = self.m.get_range_from(
                ":".join(["spoon", hostname, "current_proxy"]))
            if current_proxy_list:
                ran_num = random.randint(0, len(current_proxy_list) // 3)
                proxy = current_proxy_list[ran_num].decode("utf-8")
                parts = proxy.split(":")
                log.info("Change Remote Proxy: {0}".format(proxy))
                return client.get_client(parts[0], int(parts[1]))
            log.info("Change Remote Proxy: {0}:{1}".format(
                self.default_remote_host, self.default_remote_port))
    return client.get_client(self.default_remote_host,
                             self.default_remote_port)