def save(self):
    ## create diff: drop the old proxies, then rebuild them from the form data
    Proxy.objects.filter(delegating=self.user).delete()
    newproxy = Proxy(delegating=self.user, isdefault=True)
    newproxy.save()
    if 'main_proxy' in self.data:
        for user in self.data.getlist('main_proxy'):
            user_object = CustomUser.objects.get(pk=user)
            newproxy.delegates.add(user_object)
        newproxy.save()
    for count in range(int(self.data["tagfieldcount"])):
        if ("side_proxy%d" % count in self.data
                and "side_proxy_tags%d" % count in self.data):
            newproxy = Proxy(delegating=self.user)
            newproxy.save()
            for user in self.data.getlist("side_proxy%d" % count):
                user_object = CustomUser.objects.get(pk=user)
                newproxy.delegates.add(user_object)
            for tag in self.data.getlist("side_proxy_tags%d" % count):
                tag_object = Tag.objects.get(pk=tag)
                newproxy.tags.add(tag_object)
            newproxy.save()
    return
def save(self):
    ### first disable all existing proxies (suboptimal, but easiest) ###
    for proxy in Proxy.objects.filter(delegating=self.user):
        proxy.disable()
    ### add new proxy for each form element ###
    newproxy = Proxy(delegating=self.user, isdefault=True)
    newproxy.save()
    if 'main_proxy' in self.data:
        for user in self.data.getlist('main_proxy'):
            user_object = CustomUser.objects.get(pk=user)
            newproxy.delegates.add(user_object)
        newproxy.save()
    for count in range(int(self.data["tagfieldcount"])):
        if ("side_proxy%d" % count in self.data
                and "side_proxy_tags%d" % count in self.data):
            newproxy = Proxy(delegating=self.user)
            newproxy.save()
            for user in self.data.getlist("side_proxy%d" % count):
                user_object = CustomUser.objects.get(pk=user)
                newproxy.delegates.add(user_object)
            for tag in self.data.getlist("side_proxy_tags%d" % count):
                tag_object = Tag.objects.get(pk=tag)
                newproxy.tags.add(tag_object)
            newproxy.save()
    return
def filte(content):
    # Parse one <tr> per proxy; columns: 2 = IP, 3 = port, 4 = location,
    # 5 = anonymity level, 6 = protocol.
    soup = BeautifulSoup(content)
    proxy_list_info = soup.findAll('tr')
    proxy_list = []
    for row in proxy_list_info:
        proxy_tds = row.findAll('td')
        has_get = False
        proxy = Proxy()
        for td_index, proxy_td in enumerate(proxy_tds, start=1):
            if td_index == 2:
                has_get = True
                proxy.ip = proxy_td.text
            elif td_index == 3:
                proxy.port = proxy_td.text
            elif td_index == 4:
                if proxy_td.a is not None:
                    proxy.location = proxy_td.a.text
            elif td_index == 5:
                proxy.anonymous_type = proxy_td.text
            elif td_index == 6:
                proxy.proxy_type = proxy_td.text.lower()
        if has_get:
            proxy_list.append(proxy)
    return proxy_list
def load(self) -> list:
    ls = []
    if self._num is None:
        return ls
    if self._context and self._context.logger:
        self._context.logger.info('SixSixIPProxySpider: loading proxy list.')
    url = SixSixIPProxySpider._POOL_URL.format(self._num)
    # Match "ip:port" pairs immediately followed by "<br />".
    reg = re.compile(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)(?=<br />)')
    try:
        res = requests.get(url, proxies=self._sys_proxy, timeout=self._timeout)
        for match in reg.finditer(res.text):
            try:
                # The source does not state a protocol, so record each
                # address once for http and once for https.
                for protocol in ('http', 'https'):
                    proxy = Proxy()
                    proxy.ip = match.group(1)
                    proxy.port = match.group(2)
                    proxy.protocol = protocol
                    proxy.proxy_url = self.proxy_url(proxy.ip, proxy.port, proxy.protocol)
                    proxy.collect_time = Datetime.now()
                    proxy.local = Config.local
                    ls.append(proxy)
            except Exception:
                pass
        return ls
    except Exception:
        if self._context and self._context.logger:
            self._context.logger.exception('SixSixIPProxySpider: failed to load proxy list.')
        raise
def addProxies(s):
    # One "ip:port" entry per line; duplicates and bad rows are skipped.
    for line in s.splitlines():
        try:
            Proxy(ip_and_port=line, proxy_type='http').save()
        except Exception:
            pass
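# Usage sketch (assumption, not from the original source): feed addProxies()
# a newline-separated blob of ip:port addresses.
addProxies("203.0.113.10:8080\n203.0.113.11:3128")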
def load(self) -> list:
    ls = []
    if self._context and self._context.logger:
        self._context.logger.info('FatezeroProxySpider: loading proxy list.')
    try:
        res = requests.get(FatezeroProxySpider._POOL_URL,
                           proxies=self._sys_proxy, timeout=self._timeout)
        # The pool serves one JSON object per line.
        for text in res.text.split('\n'):
            try:
                p = json.loads(text)
                proxy = Proxy()
                proxy.ip = p['host']
                proxy.port = p['port']
                proxy.protocol = p['type']
                proxy.proxy_url = self.proxy_url(proxy.ip, proxy.port, proxy.protocol)
                proxy.collect_time = Datetime.now()
                proxy.local = Config.local
                ls.append(proxy)
            except Exception:
                pass
        if self._num is None:
            return ls
        return ls[:self._num]
    except Exception:
        if self._context and self._context.logger:
            self._context.logger.exception('FatezeroProxySpider: failed to load proxy list.')
        raise
def save_proxies(url):
    # Fetch a page, extract every address matching PROXY_REGEX, and store the
    # ones not already in the database.
    try:
        r = fetch(url)
    except requests.exceptions.RequestException:
        return False
    addresses = re.findall(PROXY_REGEX, r.text)
    for address in addresses:
        proxy = Proxy(address=address)
        try:
            proxy.save()
        except NotUniqueError:
            pass
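# Usage sketch (assumption, not from the original source): seed the pool by
# running save_proxies() over a few source pages; the URLs are hypothetical.
for seed_url in ('http://example-proxy-list.test/p1',
                 'http://example-proxy-list.test/p2'):
    save_proxies(seed_url)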
def create_or_update_proxy(raw_protocol: Proxy.PROTOCOLS, auth_data, domain,
                           port, start_checking_time, end_checking_time):
    if raw_protocol is None or domain is None or port is None \
            or auth_data is None or start_checking_time is None \
            or end_checking_time is None:
        raise Exception("Bad arguments")
    if raw_protocol < 0 or raw_protocol >= len(Proxy.PROTOCOLS):
        raise Exception("Bad protocol")

    # Response time in microseconds.
    response_time = int(round((end_checking_time - start_checking_time) * 1000000))

    proxy = session.query(Proxy).filter(
        sqlalchemy.and_(Proxy.raw_protocol == raw_protocol,
                        Proxy.auth_data == auth_data,
                        Proxy.domain == domain,
                        Proxy.port == port)).first()
    if proxy is None:
        # Doesn't exist yet, so create it; an existing row is updated in place.
        proxy = Proxy(number_of_bad_checks=0, raw_protocol=raw_protocol,
                      auth_data=auth_data, domain=domain, port=port)
        session.add(proxy)

    if proxy.bad_proxy or proxy.uptime is None or proxy.uptime == 0:
        proxy.uptime = int(time.time())
    if proxy.bad_uptime is None or proxy.bad_uptime == 0 \
            or proxy.number_of_bad_checks > settings.DEAD_PROXY_THRESHOLD:
        proxy.bad_uptime = int(time.time())

    proxy.response_time = response_time
    proxy.number_of_bad_checks = 0
    proxy.last_check_time = int(time.time())

    # Interpolate the next checking period between the configured minimum and
    # maximum, proportionally to how long this check took.
    checking_time = int(end_checking_time - start_checking_time)
    if checking_time > settings.PROXY_CHECKING_TIMEOUT:
        checking_time = settings.PROXY_CHECKING_TIMEOUT
    proxy.checking_period = settings.MIN_PROXY_CHECKING_PERIOD \
        + (checking_time / settings.PROXY_CHECKING_TIMEOUT) \
        * (settings.MAX_PROXY_CHECKING_PERIOD - settings.MIN_PROXY_CHECKING_PERIOD)

    session.commit()
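# Worked example of the checking-period interpolation above (the settings
# values are assumptions for illustration): with MIN_PROXY_CHECKING_PERIOD=300,
# MAX_PROXY_CHECKING_PERIOD=3600 and PROXY_CHECKING_TIMEOUT=30, a check that
# took 6 seconds gives 300 + (6 / 30) * (3600 - 300) = 960 seconds, so slower
# proxies are re-checked less often.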
def add_proxy(request):
    # Accept a comma-separated list of proxy URLs in the request body and
    # return the error messages, if any, for rows that failed to save.
    urls = request.body
    print(urls)
    url_list = urls.split(',')
    rst = []
    for url in url_list:
        try:
            p = Proxy(url=url, rate=0)
            p.save()
        except Exception as e:
            rst.append(str(e))
            continue
    return HttpResponse(','.join(rst))
def __execute_one_spider_task(self, spider):
    """Run a single crawl task for one concrete spider."""
    try:
        for proxy in spider.get_proxies():
            proxy = check_proxy(proxy)
            if proxy.speed != -1:
                session = Session()
                exist = session.query(Proxy) \
                    .filter(Proxy.ip == str(proxy.ip),
                            Proxy.port == str(proxy.port)) \
                    .first()
                if not exist:
                    obj = Proxy(
                        ip=str(proxy.ip),
                        port=str(proxy.port),
                        protocol=proxy.protocol,
                        nick_type=proxy.nick_type,
                        speed=proxy.speed,
                        area=str(proxy.area),
                        score=proxy.score,
                        disable_domain=proxy.disable_domain,
                        origin=str(proxy.origin),
                        create_time=datetime.now()
                    )
                    session.add(obj)
                    session.commit()
                    session.close()
                    logger.info(f'insert: {proxy.ip}:{proxy.port} from {proxy.origin}!')
                else:
                    # Already known: reset the score and refresh the metadata.
                    exist.score['score'] = settings.MAX_SCORE
                    exist.score['power'] = 0
                    exist.port = proxy.port
                    exist.protocol = proxy.protocol
                    exist.nick_type = proxy.nick_type
                    exist.speed = proxy.speed
                    exist.area = proxy.area
                    exist.disable_domain = proxy.disable_domain
                    exist.origin = proxy.origin
                    session.commit()
                    session.close()
                    logger.info(f'update: {proxy.ip}:{proxy.port} to max score successfully!')
            else:
                logger.info(f'invalid: {proxy.ip}:{proxy.port} from {proxy.origin}!')
    except Exception as e:
        logger.error(f'spider error: {e}')
async def process_raw_proxy(self, proxy, collector_id):
    self.logger.debug("processing raw proxy \"{}\"".format(proxy))
    try:
        _, auth_data, domain, port = proxy_validator.retrieve(proxy)
    except proxy_validator.ValidationError as ex:
        self.collectors_logger.error(
            "Collector with id \"{}\" returned bad raw proxy \"{}\". "
            "Message: {}".format(collector_id, proxy, ex)
        )
        return

    # don't care about protocol
    try:
        proxy = await db.get(
            Proxy.select().where(
                Proxy.auth_data == auth_data,
                Proxy.domain == domain,
                Proxy.port == port,
            )
        )
        if proxy.last_check_time + settings.PROXY_NOT_CHECKING_PERIOD >= time.time():
            proxy_short_address = ""
            if auth_data:
                proxy_short_address += auth_data + "@"
            proxy_short_address += "{}:{}".format(domain, port)
            self.logger.debug(
                "skipping proxy \"{}\" from collector \"{}\"".format(
                    proxy_short_address, collector_id)
            )
            return
    except Proxy.DoesNotExist:
        pass

    for raw_protocol in range(len(Proxy.PROTOCOLS)):
        while not self.good_proxies_are_processed:
            # TODO: find a better way
            await asyncio.sleep(0.1)

        new_proxy = Proxy()
        new_proxy.raw_protocol = raw_protocol
        new_proxy.auth_data = auth_data
        new_proxy.domain = domain
        new_proxy.port = port
        await self.add_proxy_to_queue(new_proxy, collector_id)
def add_proxy(request):
    if request.method == 'POST':
        port = request.POST.get('port', '0')
        destination = request.POST.get('destination', 'https://empty')
        delay = request.POST.get('delay', '0')
        p = Proxy(delay=delay, port=port, destination=destination,
                  started=False, pid=0)
        p.save()
        # Echo the stored row back to the client.
        proxy = Proxy.objects.filter(id=p.id).values()
        response = [{"message": "proxy was started", "proxy": list(proxy)[0]}]
        return JsonResponse(response, safe=False)
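# Example request against the view above (the URL route is an assumption):
#   curl -X POST \
#        -d 'port=8081&destination=https://example.org&delay=5' \
#        http://localhost:8000/add_proxy/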
def use_thread_with_queue2():
    cleanup()
    # Earlier queue-based variant, kept for reference:
    # in_queue = Queue.Queue()
    # out_queue = Queue.Queue()
    #
    # for i in range(5):
    #     t = threading.Thread(target=save_proxies_with_queue2,
    #                          args=(in_queue, out_queue))
    #     t.setDaemon(True)
    #     t.start()
    #
    # for url in PROXY_SITES:
    #     in_queue.put(url)
    #
    # result = []
    #
    # for i in range(5):
    #     t = threading.Thread(target=append_result,
    #                          args=(out_queue, result))
    #     t.setDaemon(True)
    #     t.start()
    #
    # in_queue.join()
    # out_queue.join()
    addresses = []
    # Paid API variant, disabled (key elided):
    # mogu_key = ""
    # res = requests.get(mogu_key)
    # addresses = res.json()['msg']
    for address in addresses:
        proxy = Proxy(address=address['ip'] + ':' + address['port'])
        try:
            proxy.save()
        except NotUniqueError:
            pass
    pool = Pool(10)
    pool.map(check_proxy, Proxy.objects.all())
    print(len(addresses))
    print(Proxy.objects.count())
def save_proxies(url):
    proxies = []
    try:
        res = requests.get(url)
    except requests.exceptions.RequestException:
        return False
    addresses = re.findall(PROXY_REGEX, res.text)
    for address in addresses:
        proxy = Proxy(address=address)
        try:
            proxy.save()
        except NotUniqueError:
            pass
        else:
            proxies.append(address)
    return proxies
def get_ips(self):
    for pat in pats:
        objs = []
        headers = {
            'content-type': 'application/json',
            'User-Agent': random.choice(agents),
        }
        for i in range(100):
            # Route each request through a random proxy from the database.
            ip_obj = self.session.query(Proxy).order_by(func.random()).first()
            proxies = {
                ip_obj.type: '{type}://{ip}:{port}'.format(
                    type=ip_obj.type, ip=ip_obj.ip, port=ip_obj.port)
            }
            url = '{base_url}{pat}{page}'.format(base_url=base_url, pat=pat, page=i)
            logger.info('Scraping {url}'.format(url=url))
            try:
                response = requests.get(url, headers=headers, proxies=proxies)
                if response.status_code == 200:
                    selector = etree.HTML(response.text)
                    for line in selector.xpath('//table[@id="ip_list"]//tr[@class="odd"]'):
                        proxy_obj = Proxy()
                        proxy_obj.id = str(uuid.uuid1())
                        proxy_obj.ip = line.xpath('td')[1].xpath('text()')[0]
                        proxy_obj.port = line.xpath('td')[2].xpath('text()')[0]
                        proxy_obj.type = str(
                            line.xpath('td')[5].xpath('text()')[0]
                        ).lower().replace('https', 'http')
                        objs.append(proxy_obj)
            except Exception:
                pass
        self._threads_check(objs)
def filte(content):
    soup = BeautifulSoup(content)
    proxy_list_tables = soup.findAll('table')
    table_index = 0
    pattern = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
    proxy_list = []
    for table in proxy_list_tables:
        table_index += 1
        # The proxy rows live in the third table on the page.
        if table_index == 3:
            proxy_list_info = table.findAll('tr')
            for row in proxy_list_info:
                proxy_tds = row.findAll('td')
                proxy = Proxy()
                is_proxy = False
                for td_index, proxy_td in enumerate(proxy_tds, start=1):
                    if td_index == 2:
                        rel_ip_info = re.search(pattern, proxy_td.text)
                        if rel_ip_info:
                            proxy.ip = rel_ip_info.group(0)
                            is_proxy = True
                    elif td_index == 3:
                        if is_proxy:
                            proxy.port = int(proxy_td.text)
                    elif td_index == 4:
                        if is_proxy:
                            # Site labels: '匿名代理' / '高度匿名' mean anonymous;
                            # record those as '高匿' (elite) and everything
                            # else as '透明' (transparent).
                            if proxy_td.text in ('匿名代理', '高度匿名'):
                                proxy.anonymous_type = '高匿'
                            else:
                                proxy.anonymous_type = '透明'
                    elif td_index == 5:
                        if is_proxy:
                            proxy.location = proxy_td.text
                proxy.proxy_type = 'http'
                if is_proxy:
                    proxy_list.append(proxy)
    return proxy_list
def proxy(operation=None, proxy_id=None):
    if proxy_id and operation == 'delete':
        try:
            rm_proxy = Proxy.query.get(proxy_id)
            db.session.delete(rm_proxy)
            db.session.commit()
            flash(u'Proxy removed', 'success')
        except Exception:
            flash(u'Proxy does not exist', 'error')
    if request.method == 'POST':
        proxy = Proxy(ptype=request.values.get('type', None),
                      ip=request.values.get('ip', None),
                      port=request.values.get('port', None))
        db.session.add(proxy)
        db.session.commit()
        flash(u'Proxy added', 'success')
    proxy_list = []
    results = Proxy.query.all()
    if results:
        for result in results:
            proxy_list.append({
                'id': result.id,
                'ptype': result.ptype,
                'ip': result.ip,
                'port': result.port,
                'status': result.status,
            })
    return render_template('proxy.html', data=proxy_list,
                           base_url=BASE_SITE_URL, scrapyd_url=scrapyd_url)
def set_proxy(bot, update, args):
    usage = ("Please include the proxy data in the command, "
             "as in the example:\n"
             "<code>/set_proxy mydomain.com:8080:usErnAme:s3cret</code>")
    if len(args) == 1:
        proxy_data = args[0].split(':')
        if len(proxy_data) != 4:
            update.message.reply_text(usage, parse_mode=ParseMode.HTML)
            return
        proxy_ip, proxy_port, proxy_username, proxy_password = proxy_data
        # Only one proxy row is kept: replace any existing one.
        current_proxy = session.query(Proxy).first()
        if current_proxy:
            session.delete(current_proxy)
        new_proxy = Proxy(proxy_ip, proxy_port, proxy_username, proxy_password)
        session.add(new_proxy)
        session.commit()
        update.message.reply_text("Proxy settings updated.")
    else:
        update.message.reply_text(usage, parse_mode=ParseMode.HTML)
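# Registration sketch (assumption: the (bot, update, args) signature above
# implies python-telegram-bot's pre-v12 API; the token is a placeholder).
from telegram.ext import Updater, CommandHandler

updater = Updater('BOT_TOKEN')
updater.dispatcher.add_handler(
    CommandHandler('set_proxy', set_proxy, pass_args=True))
updater.start_polling()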
def __init__(self, proxy_test_filter, context=None):
    super().__init__(context=context)
    self._proxy_test_filter = proxy_test_filter
    self._entity = Proxy()
            # (tail of check_anonymous; its beginning is truncated in the source)
            return (True, (time.time() - time1) * 1000)
        return (False, 0)
    except Exception:
        return (False, 0)

def check_google(proxy_info):
    # Fetch Google's homepage through the proxy and verify the page title.
    proxy_content = proxy_info.ip + ':' + str(proxy_info.port)
    proxy = urllib2.ProxyHandler({proxy_info.proxy_type: proxy_content})
    opener = urllib2.build_opener(proxy)
    urllib2.install_opener(opener)
    try:
        time1 = time.time()
        response = urllib2.urlopen(GOOGLE_CHECK_URL, timeout=3)
        title = BeautifulSoup(response.read()).title.text
        if 'Google' == str(title):
            proxy_info.check_time = str(datetime.now()).split('.')[0]
            return (True, (time.time() - time1) * 1000)
        else:
            return (False, 0)
    except Exception:
        return (False, 0)

if __name__ == '__main__':
    proxy = Proxy()
    proxy.ip = '222.74.6.48'
    proxy.port = '8000'
    proxy.proxy_type = 'http'
    default_ip = get_default_ip()
    print check_anonymous(proxy, default_ip)
from models import Proxy
from main_proxy import text_proxy_generator

# Persist every address yielded by the generator as a Proxy row.
for host_port in text_proxy_generator():
    Proxy(host_port=host_port).save()