def load(self) -> list:
    """Fetch up to ``self._num`` proxies from the 66ip pool page.

    Returns a list of ``Proxy`` objects (empty when ``self._num`` is None).
    Re-raises any request failure after logging it; individual malformed
    entries are skipped best-effort.
    """
    ls = []
    if self._num is None:
        return ls
    if self._context and self._context.logger:
        self._context.logger.info('SixSixIPProxySpider: loading proxy list.')
    url = SixSixIPProxySpider._POOL_URL.format(self._num)
    # "ip:port" pairs immediately followed by "<br />" in the page body.
    reg = re.compile(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)(?=<br />)')
    try:
        res = requests.get(url, proxies=self._sys_proxy, timeout=self._timeout)
        for match in reg.finditer(res.text):
            # Register each endpoint once per protocol. The try sits inside
            # the protocol loop so a failure on one entry no longer discards
            # its sibling (the original wrapped both iterations in one try).
            for protocol in ('http', 'https'):
                try:
                    proxy = Proxy()
                    proxy.ip = match.group(1)
                    proxy.port = match.group(2)
                    proxy.protocol = protocol
                    proxy.proxy_url = self.proxy_url(proxy.ip, proxy.port, proxy.protocol)
                    proxy.collect_time = Datetime.now()
                    proxy.local = Config.local
                    ls.append(proxy)
                except Exception:
                    # Best-effort: skip a malformed entry, keep the rest.
                    pass
        return ls
    except Exception:
        if self._context and self._context.logger:
            self._context.logger.exception('SixSixIPProxySpider: Failed to load proxy list.')
        raise
def filte(content):
    """Parse an HTML proxy-table page into a list of ``Proxy`` objects.

    Expected column layout per ``<tr>`` (1-based): 2 = IP, 3 = port,
    4 = location (inside an ``<a>`` tag), 5 = anonymity level,
    6 = protocol type. Rows without a second cell are dropped.
    """
    soup = BeautifulSoup(content)
    proxy_list = []
    for row in soup.findAll('tr'):
        # Fresh record per row; `has_get` flags that the IP cell was seen.
        record = Proxy()
        has_get = False
        for td_index, cell in enumerate(row.findAll('td'), start=1):
            if td_index == 2:
                has_get = True
                record.ip = cell.text
            elif td_index == 3:
                record.port = cell.text
            elif td_index == 4:
                # Location lives in a nested anchor; some rows lack it.
                if cell.a is not None:
                    record.location = cell.a.text
            elif td_index == 5:
                record.anonymous_type = cell.text
            elif td_index == 6:
                record.proxy_type = cell.text.lower()
        if has_get:
            proxy_list.append(record)
    return proxy_list
def load(self) -> list:
    """Fetch the fatezero proxy pool (one JSON object per line).

    Returns at most ``self._num`` ``Proxy`` objects (all of them when
    ``self._num`` is None). Re-raises any request failure after logging it;
    unparsable lines are skipped best-effort.
    """
    ls = []
    if self._context and self._context.logger:
        self._context.logger.info('FatezeroProxySpider: loading proxy list.')
    try:
        res = requests.get(FatezeroProxySpider._POOL_URL,
                           proxies=self._sys_proxy, timeout=self._timeout)
        for text in res.text.split('\n'):
            try:
                # BUG FIX: json.loads() lost its `encoding` keyword in
                # Python 3.9; passing it raised TypeError which the old bare
                # except silently swallowed, yielding an empty list.
                p = json.loads(text)
                proxy = Proxy()
                proxy.ip = p['host']
                proxy.port = p['port']
                proxy.protocol = p['type']
                proxy.proxy_url = self.proxy_url(proxy.ip, proxy.port, proxy.protocol)
                proxy.collect_time = Datetime.now()
                proxy.local = Config.local
                ls.append(proxy)
            except (ValueError, KeyError):
                # Blank/garbled line or missing field: skip it.
                pass
        if self._num is None:
            return ls
        return ls[:self._num]
    except Exception:
        if self._context and self._context.logger:
            self._context.logger.exception('FatezeroProxySpider: Failed to load proxy list.')
        raise
def filte(content):
    """Parse an HTML proxy-table page into a list of ``Proxy`` objects.

    Expected column layout per ``<tr>`` (1-based): 2 = IP, 3 = port,
    4 = location (inside an ``<a>`` tag), 5 = anonymity level,
    6 = protocol type. Rows without a second cell are dropped.
    """
    soup = BeautifulSoup(content)
    proxy_list = []
    for row in soup.findAll('tr'):
        # One record per table row; only keep it if the IP cell was present.
        record = Proxy()
        has_get = False
        for td_index, cell in enumerate(row.findAll('td'), start=1):
            if td_index == 2:
                has_get = True
                record.ip = cell.text
            elif td_index == 3:
                record.port = cell.text
            elif td_index == 4:
                # Location sits inside an anchor that some rows omit.
                if cell.a is not None:
                    record.location = cell.a.text
            elif td_index == 5:
                record.anonymous_type = cell.text
            elif td_index == 6:
                record.proxy_type = cell.text.lower()
        if has_get:
            proxy_list.append(record)
    return proxy_list
def get_ips(self):
    """Crawl 100 listing pages per pattern through random stored proxies.

    For each pattern in the module-level ``pats``, fetches pages
    ``base_url + pat + page`` (page 0..99) using a proxy drawn at random
    from the database, scrapes the ``#ip_list`` table rows, and hands the
    harvested ``Proxy`` candidates to ``self._threads_check`` for
    validation. Fetch/parse failures skip the page (best-effort).
    """
    for pat in pats:
        objs = []
        headers = {
            'content-type': 'application/json',
            'User-Agent': random.choice(agents),
        }
        for page in range(100):
            # Route each request through a random proxy from the DB so the
            # target site does not see a single source address.
            ip_obj = self.session.query(Proxy).order_by(func.random()).first()
            proxies = {
                '{type}'.format(type=ip_obj.type):
                    '{type}://{ip}:{port}'.format(
                        type=ip_obj.type, ip=ip_obj.ip, port=ip_obj.port),
            }
            url = '{base_url}{pat}{page}'.format(
                base_url=base_url, pat=pat, page=page)
            logger.info('Scrapy {url}'.format(url=url))
            try:
                response = requests.get(url, headers=headers, proxies=proxies)
                if response.status_code == 200:
                    selector = etree.HTML(response.text)
                    # Only the zebra-striped "odd" rows carry proxy entries.
                    for line in selector.xpath(
                            '//table[@id="ip_list"]//tr[@class="odd"]'):
                        proxy_obj = Proxy()
                        proxy_obj.id = str(uuid.uuid1())
                        proxy_obj.ip = line.xpath('td')[1].xpath('text()')[0]
                        proxy_obj.port = line.xpath('td')[2].xpath('text()')[0]
                        # Normalize "HTTPS" down to "http" for storage.
                        proxy_obj.type = str(
                            line.xpath('td')[5].xpath('text()')[0]
                        ).lower().replace('https', 'http')
                        objs.append(proxy_obj)
            except Exception:
                # Narrowed from a bare except (which also ate
                # KeyboardInterrupt): a dead proxy or a parse error simply
                # skips this page.
                pass
        self._threads_check(objs)
def filte(content):
    """Extract proxies from the third ``<table>`` of the page.

    Per row (1-based columns): 2 = text containing an IPv4 address,
    3 = port, 4 = anonymity label (Chinese), 5 = location. Rows whose
    second cell has no IPv4 match are dropped; kept rows are tagged
    ``proxy_type = 'http'``.
    """
    soup = BeautifulSoup(content)
    pattern = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
    proxy_list = []
    for table_index, table in enumerate(soup.findAll('table'), start=1):
        if table_index != 3:
            # Only the third table on the page holds the proxy listing.
            continue
        for row in table.findAll('tr'):
            record = Proxy()
            is_proxy = False
            for td_index, cell in enumerate(row.findAll('td'), start=1):
                if td_index == 2:
                    rel_ip_info = pattern.search(cell.text)
                    if rel_ip_info:
                        record.ip = rel_ip_info.group(0)
                        is_proxy = True
                elif td_index == 3:
                    if is_proxy:
                        record.port = int(cell.text)
                elif td_index == 4:
                    if is_proxy:
                        # Map the site's labels onto the two buckets used
                        # elsewhere: 高匿 (elite) vs 透明 (transparent).
                        if cell.text in ('匿名代理', '高度匿名'):
                            record.anonymous_type = '高匿'
                        else:
                            record.anonymous_type = '透明'
                elif td_index == 5:
                    if is_proxy:
                        record.location = cell.text
                        record.proxy_type = 'http'
            if is_proxy:
                proxy_list.append(record)
    return proxy_list
def filte(content):
    """Pull proxy entries out of the third ``<table>`` on the page.

    Column meaning per row (1-based): 2 holds text with an IPv4 address,
    3 the port, 4 the anonymity label, 5 the location. A row is kept only
    when column 2 yields an IPv4 match; kept entries are marked as HTTP.
    """
    soup = BeautifulSoup(content)
    ip_pattern = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
    results = []
    table_no = 0
    for table in soup.findAll('table'):
        table_no += 1
        if table_no != 3:
            continue  # the proxy listing is always the third table
        for row in table.findAll('tr'):
            candidate = Proxy()
            found_ip = False
            for col_no, cell in enumerate(row.findAll('td'), 1):
                if col_no == 2:
                    ip_match = ip_pattern.search(cell.text)
                    if ip_match:
                        candidate.ip = ip_match.group(0)
                        found_ip = True
                elif col_no == 3 and found_ip:
                    candidate.port = int(cell.text)
                elif col_no == 4 and found_ip:
                    # 匿名代理 / 高度匿名 -> elite (高匿); everything else
                    # is treated as transparent (透明).
                    is_elite = cell.text == '匿名代理' or cell.text == '高度匿名'
                    candidate.anonymous_type = '高匿' if is_elite else '透明'
                elif col_no == 5 and found_ip:
                    candidate.location = cell.text
                    candidate.proxy_type = 'http'
            if found_ip:
                results.append(candidate)
    return results
        # NOTE(review): tail of a checker function whose `def` lies outside
        # this chunk — returns (reachable, latency_ms) and maps any failure
        # to (False, 0). Indentation reconstructed; confirm against the
        # full file.
        return (False, 0)
    except:
        return (False, 0)


def check_google(proxy_info):
    """Probe GOOGLE_CHECK_URL through the given proxy (Python 2 / urllib2).

    Returns a tuple (ok, latency_ms): (True, elapsed milliseconds) when the
    fetched page's <title> is exactly "Google" (also stamps
    proxy_info.check_time with the current time, seconds precision),
    otherwise (False, 0). Any error — timeout, connect failure, parse
    failure — yields (False, 0).
    """
    proxy_content = proxy_info.ip + ":" + str(proxy_info.port)
    # install_opener makes the proxy global for this process — every later
    # urllib2.urlopen call goes through it, not just this one.
    proxy = urllib2.ProxyHandler({proxy_info.proxy_type: proxy_content})
    opener = urllib2.build_opener(proxy)
    urllib2.install_opener(opener)
    try:
        time1 = time.time()
        response = urllib2.urlopen(GOOGLE_CHECK_URL, timeout=3)
        title = BeautifulSoup(response.read()).title.text
        if "Google" == str(title):
            # Drop the fractional seconds from the timestamp.
            proxy_info.check_time = str(datetime.now()).split(".")[0]
            return (True, (time.time() - time1) * 1000)
        else:
            return (False, 0)
    except:
        return (False, 0)


if __name__ == "__main__":
    # Ad-hoc manual smoke test with a hard-coded proxy endpoint.
    proxy = Proxy()
    proxy.ip = "222.74.6.48"
    proxy.port = "8000"
    proxy.proxy_type = "http"
    default_ip = get_default_ip()
    print check_anonymous(proxy, default_ip)
            # NOTE(review): tail of a checker function whose `def` lies
            # outside this chunk — success path returns
            # (True, latency_ms); fall-through and any exception map to
            # (False, 0). Indentation reconstructed; confirm against the
            # full file.
            return (True, (time.time() - time1) * 1000)
        return (False, 0)
    except:
        return (False, 0)


def check_google(proxy_info):
    """Probe GOOGLE_CHECK_URL through the given proxy (Python 2 / urllib2).

    Returns (ok, latency_ms): (True, elapsed milliseconds) when the page
    title is exactly 'Google' (also stamps proxy_info.check_time, seconds
    precision), otherwise (False, 0). Any error yields (False, 0).
    """
    proxy_content = proxy_info.ip + ':' + str(proxy_info.port)
    # install_opener sets the proxy process-wide for later urlopen calls.
    proxy = urllib2.ProxyHandler({proxy_info.proxy_type : proxy_content})
    opener = urllib2.build_opener(proxy)
    urllib2.install_opener(opener)
    try:
        time1 = time.time()
        response = urllib2.urlopen(GOOGLE_CHECK_URL, timeout=3)
        title = BeautifulSoup(response.read()).title.text
        if 'Google' == str(title):
            # Keep only the whole-seconds part of the timestamp.
            proxy_info.check_time = str(datetime.now()).split('.')[0]
            return (True, (time.time() - time1) * 1000)
        else:
            return (False, 0)
    except:
        return (False, 0)


if __name__ == '__main__':
    # Ad-hoc manual smoke test with a hard-coded proxy endpoint.
    proxy = Proxy()
    proxy.ip = '222.74.6.48'
    proxy.port = '8000'
    proxy.proxy_type = 'http'
    default_ip = get_default_ip()
    print check_anonymous(proxy, default_ip)
        # NOTE(review): tail of a checker function whose `def` lies outside
        # this chunk — maps any failure to (False, 0). Indentation
        # reconstructed; confirm against the full file.
        return (False, 0)
    except:
        return (False, 0)


def check_google(proxy_info):
    """Probe GOOGLE_CHECK_URL through the given proxy (Python 2 / urllib2).

    Returns (ok, latency_ms): (True, elapsed milliseconds) when the page
    title is exactly 'Google' (also stamps proxy_info.check_time, seconds
    precision), otherwise (False, 0). Any error yields (False, 0).
    """
    proxy_content = proxy_info.ip + ':' + str(proxy_info.port)
    # install_opener sets the proxy process-wide for later urlopen calls.
    proxy = urllib2.ProxyHandler({proxy_info.proxy_type : proxy_content})
    opener = urllib2.build_opener(proxy)
    urllib2.install_opener(opener)
    try:
        time1 = time.time()
        response = urllib2.urlopen(GOOGLE_CHECK_URL, timeout=3)
        title = BeautifulSoup(response.read()).title.text
        if 'Google' == str(title):
            # Keep only the whole-seconds part of the timestamp.
            proxy_info.check_time = str(datetime.now()).split('.')[0]
            return (True, (time.time() - time1) * 1000)
        else:
            return (False, 0)
    except:
        return (False, 0)


if __name__ == '__main__':
    # Ad-hoc manual smoke test with a hard-coded proxy endpoint.
    proxy = Proxy()
    proxy.ip = '222.74.6.48'
    proxy.port = '8000'
    proxy.proxy_type = 'http'
    default_ip = get_default_ip()
    print check_anonymous(proxy, default_ip)