def save(self):
    ## create diff
    Proxy.objects.filter(delegating=self.user).delete()
    newproxy = Proxy(delegating=self.user, isdefault=True)
    newproxy.save()
    if 'main_proxy' in self.data.keys():
        for user in self.data.getlist('main_proxy'):
            user_object = CustomUser.objects.get(pk=user)
            newproxy.delegates.add(user_object)
        newproxy.save()
    for count in xrange(int(self.data["tagfieldcount"])):
        if "side_proxy%d" % count in self.data.keys() and "side_proxy_tags%d" % count in self.data.keys():
            newproxy = Proxy(delegating=self.user)
            newproxy.save()
            for user in self.data.getlist("side_proxy%d" % count):
                user_object = CustomUser.objects.get(pk=user)
                newproxy.delegates.add(user_object)
            for tag in self.data.getlist("side_proxy_tags%d" % count):
                tag_object = Tag.objects.get(pk=tag)
                newproxy.tags.add(tag_object)
            newproxy.save()
    return
def save(self):
    ### first disable all existing proxies (suboptimal, but easiest) ###
    for proxy in Proxy.objects.filter(delegating=self.user):
        proxy.disable()
    ### add new proxy for each form element ###
    newproxy = Proxy(delegating=self.user, isdefault=True)
    newproxy.save()
    if 'main_proxy' in self.data.keys():
        for user in self.data.getlist('main_proxy'):
            user_object = CustomUser.objects.get(pk=user)
            newproxy.delegates.add(user_object)
        newproxy.save()
    for count in xrange(int(self.data["tagfieldcount"])):
        if "side_proxy%d" % count in self.data.keys() and "side_proxy_tags%d" % count in self.data.keys():
            newproxy = Proxy(delegating=self.user)
            newproxy.save()
            for user in self.data.getlist("side_proxy%d" % count):
                user_object = CustomUser.objects.get(pk=user)
                newproxy.delegates.add(user_object)
            for tag in self.data.getlist("side_proxy_tags%d" % count):
                tag_object = Tag.objects.get(pk=tag)
                newproxy.tags.add(tag_object)
            newproxy.save()
    return
async def wrap(self, *args, **kwargs):
    good_proxies_count = await db.count(
        Proxy.select().where(Proxy.number_of_bad_checks == 0))
    bad_proxies_count = await db.count(Proxy.select().where(
        Proxy.number_of_bad_checks > 0,
        Proxy.number_of_bad_checks < settings.DEAD_PROXY_THRESHOLD,
    ))
    dead_proxies_count = await db.count(Proxy.select().where(
        Proxy.number_of_bad_checks >= settings.DEAD_PROXY_THRESHOLD,
        Proxy.number_of_bad_checks < settings.DO_NOT_CHECK_ON_N_BAD_CHECKS,
    ))
    not_checked_proxies_count = await db.count(Proxy.select().where(
        Proxy.number_of_bad_checks >= settings.DO_NOT_CHECK_ON_N_BAD_CHECKS,
    ))

    response = {
        "bad_proxies_count": bad_proxies_count,
        "good_proxies_count": good_proxies_count,
        "dead_proxies_count": dead_proxies_count,
        "not_checked_proxies_count": not_checked_proxies_count,
    }
    response.update(await func(self, *args, **kwargs))

    return response
async def number_of_proxies_to_process(timestamp):
    good_proxies_count = await db.count(
        Proxy.select().where(
            Proxy.number_of_bad_checks == 0,
            Proxy.next_check_time < timestamp,
        )
    )

    bad_proxies_count = await db.count(
        Proxy.select().where(
            Proxy.number_of_bad_checks > 0,
            Proxy.number_of_bad_checks < settings.DEAD_PROXY_THRESHOLD,
            Proxy.next_check_time < timestamp,
        )
    )

    dead_proxies_count = await db.count(
        Proxy.select().where(
            Proxy.number_of_bad_checks >= settings.DEAD_PROXY_THRESHOLD,
            Proxy.number_of_bad_checks < settings.DO_NOT_CHECK_ON_N_BAD_CHECKS,
            Proxy.next_check_time < timestamp,
        )
    )

    await db.create(
        NumberOfProxiesToProcess,
        timestamp=timestamp,
        good_proxies=good_proxies_count,
        bad_proxies=bad_proxies_count,
        dead_proxies=dead_proxies_count,
    )
async def process_proxies(self):
    while True:
        await asyncio.sleep(0.01)
        try:
            # check good proxies
            proxies = await db.execute(
                Proxy.select().where(
                    Proxy.number_of_bad_checks == 0,
                    Proxy.next_check_time < time.time(),
                ).order_by(Proxy.next_check_time).limit(settings.NUMBER_OF_CONCURRENT_TASKS)
            )
            if proxies:
                self.good_proxies_are_processed = False

            await self.add_proxies_to_queue(proxies)

            if proxies:
                continue

            self.good_proxies_are_processed = True

            # check bad proxies
            proxies = await db.execute(
                Proxy.select().where(
                    Proxy.number_of_bad_checks > 0,
                    Proxy.number_of_bad_checks < settings.DEAD_PROXY_THRESHOLD,
                    Proxy.next_check_time < time.time(),
                ).order_by(Proxy.next_check_time).limit(settings.NUMBER_OF_CONCURRENT_TASKS)
            )

            await self.add_proxies_to_queue(proxies)

            if proxies:
                continue

            # check dead proxies
            proxies = await db.execute(
                Proxy.select().where(
                    Proxy.number_of_bad_checks >= settings.DEAD_PROXY_THRESHOLD,
                    Proxy.number_of_bad_checks < settings.DO_NOT_CHECK_ON_N_BAD_CHECKS,
                    Proxy.next_check_time < time.time(),
                ).order_by(Proxy.next_check_time).limit(settings.NUMBER_OF_CONCURRENT_TASKS)
            )

            await self.add_proxies_to_queue(proxies)
        except KeyboardInterrupt as ex:
            raise ex
        except BaseException as ex:
            self.logger.exception(ex)
            if settings.DEBUG:
                raise ex

            await asyncio.sleep(settings.SLEEP_AFTER_ERROR_PERIOD)
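The async snippets above query a peewee-style Proxy model through an async database wrapper (db.execute, db.count, db.get, db.create). The model definition is not part of this section; the following is only a minimal sketch, with field names inferred from the queries. The database backend, defaults, and PROTOCOLS tuple are assumptions, not the actual project code.

import peewee

# assumption: any peewee-compatible database works; the real project may use PostgreSQL
database = peewee.SqliteDatabase("proxies.db")


class Proxy(peewee.Model):
    # assumption: protocol names indexed by raw_protocol
    PROTOCOLS = ("http", "socks4", "socks5")

    raw_protocol = peewee.SmallIntegerField()
    auth_data = peewee.CharField(default="")
    domain = peewee.CharField()
    port = peewee.IntegerField()
    number_of_bad_checks = peewee.IntegerField(default=0)
    last_check_time = peewee.IntegerField(default=0)   # unix timestamps
    next_check_time = peewee.IntegerField(default=0)
    checking_period = peewee.IntegerField(default=600)  # seconds, assumed default
    response_time = peewee.IntegerField(null=True)      # microseconds, per create_or_update_proxy below
    uptime = peewee.IntegerField(null=True)
    bad_uptime = peewee.IntegerField(null=True)
    # the snippets also reference address, bad_proxy, white_ipv4 and location; omitted here

    class Meta:
        database = database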
def save_proxies(url):
    try:
        r = fetch(url)
    except requests.exceptions.RequestException:
        return False
    addresses = re.findall(PROXY_REGEX, r.text)
    for address in addresses:
        proxy = Proxy(address=address)
        try:
            proxy.save()
        except NotUniqueError:
            pass
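save_proxies relies on a MongoEngine-style Proxy document whose unique address field makes a duplicate save raise NotUniqueError, and other snippets in this section call Proxy.get_random() and Proxy.drop_collection(). Below is only a minimal sketch under those assumptions; the field names and the get_random helper are illustrative, not the actual model.

import random

from mongoengine import Document, NotUniqueError, StringField  # assumption: MongoDB backend


class Proxy(Document):
    # a unique index makes a second save() of the same address raise NotUniqueError,
    # which save_proxies() silently ignores
    address = StringField(required=True, unique=True)

    @classmethod
    def get_random(cls):
        # assumption: callers expect a dict-like record with an 'address' key
        chosen = random.choice(list(cls.objects))
        return {'address': chosen.address}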
def add_proxy(request):
    urls = request.body
    print urls
    urlList = urls.split(',')
    rst = []
    for url in urlList:
        try:
            p = Proxy(url=url, rate=0)
            p.save()
        except Exception as e:
            # list.append() returns None, so collect the message first, then print it
            rst.append(e.message)
            print e.message
            continue
    return HttpResponse(reduce(lambda x, y: x + ',' + y, rst))
def load(self) -> list:
    ls = []
    if self._num is None:
        return ls
    if self._context and self._context.logger:
        self._context.logger.info('SixSixIPProxySpider: loading proxy list.')
    url = SixSixIPProxySpider._POOL_URL.format(self._num)
    reg = re.compile(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)(?=<br />)')
    try:
        res = requests.get(url, proxies=self._sys_proxy, timeout=self._timeout)
        for match in reg.finditer(res.text):
            try:
                for protocol in ('http', 'https'):
                    proxy = Proxy()
                    proxy.ip = match.group(1)
                    proxy.port = match.group(2)
                    proxy.protocol = protocol
                    proxy.proxy_url = self.proxy_url(proxy.ip, proxy.port, proxy.protocol)
                    proxy.collect_time = Datetime.now()
                    proxy.local = Config.local
                    ls.append(proxy)
            except:
                pass
        return ls
    except:
        if self._context and self._context.logger:
            self._context.logger.exception('SixSixIPProxySpider: Failed to load proxy list.')
        raise
def load(self) -> list:
    ls = []
    if self._context and self._context.logger:
        self._context.logger.info('FatezeroProxySpider: loading proxy list.')
    try:
        res = requests.get(FatezeroProxySpider._POOL_URL, proxies=self._sys_proxy, timeout=self._timeout)
        for text in res.text.split('\n'):
            try:
                p = json.loads(text, encoding='utf-8')
                proxy = Proxy()
                proxy.ip = p['host']
                proxy.port = p['port']
                proxy.protocol = p['type']
                proxy.proxy_url = self.proxy_url(proxy.ip, proxy.port, proxy.protocol)
                proxy.collect_time = Datetime.now()
                proxy.local = Config.local
                ls.append(proxy)
            except:
                pass
        if self._num is None:
            return ls
        else:
            return ls[:self._num]
    except:
        if self._context and self._context.logger:
            self._context.logger.exception('FatezeroProxySpider: Failed to load proxy list.')
        raise
def add_proxy(request):
    if request.method == 'POST':
        port = request.POST.get('port', '0')
        destination = request.POST.get('destination', 'https://empty')
        delay = request.POST.get('delay', '0')
        p = Proxy(delay=delay, port=port, destination=destination, started=False, pid=0)
        p.save()
        new_id = p.id
        proxy = Proxy.objects.filter(id=new_id).values()
        response = [{"message": "proxy was started", "proxy": list(proxy)[0]}]
        return JsonResponse(response, safe=False)
def save_search_result(p, queue, retry=0):
    proxy = Proxy.get_random()['address']
    url = SEARCH_URL.format(SEARCH_TEXT, p)
    try:
        r = fetch(url, proxy=proxy)
    except (Timeout, ConnectionError):
        sleep(0.1)
        retry += 1
        if retry > 5:
            queue.put(url)
            raise GreenletExit()
        try:
            # drop the proxy that just failed
            bad_proxy = Proxy.objects.get(address=proxy)
            if bad_proxy:
                bad_proxy.delete()
        except DoesNotExist:
            pass
        return save_search_result(p, queue, retry)
    soup = BeautifulSoup(r.text, 'lxml')
    results = soup.find(class_='results')
    if results is None:
        # this proxy has been banned, switch to another one
        sleep(0.1)
        retry += 1
        if retry > 5:
            queue.put(url)
            raise GreenletExit()
        return save_search_result(p, queue, retry)
    articles = results.find_all('div', lambda x: 'wx-rb' in x)
    for article in articles:
        save_article(article)
def addProxies(s):
    s = s.splitlines()
    for i in s:
        try:
            Proxy(ip_and_port=i, proxy_type='http').save()
        except:
            pass
async def get_proxies_html(self, request):
    proxies = await db.execute(Proxy.select().where(
        Proxy.number_of_bad_checks == 0).order_by(Proxy.response_time))
    proxies = list(proxies)
    current_timestamp = time.time()

    return {
        "proxies": [{
            "address": proxy.address,
            "response_time": proxy.response_time / 1000
                             if proxy.response_time is not None else None,
            "uptime": datetime.timedelta(seconds=int(current_timestamp - proxy.uptime))
                      if proxy.uptime is not None else None,
            "bad_uptime": datetime.timedelta(seconds=int(current_timestamp - proxy.bad_uptime))
                          if proxy.bad_uptime is not None else None,
            "last_check_time": proxy.last_check_time,
            "checking_period": proxy.checking_period,
            "number_of_bad_checks": proxy.number_of_bad_checks,
            "bad_proxy": proxy.bad_proxy,
            "white_ipv4": proxy.white_ipv4,
            "location": proxy.location,
        } for proxy in proxies]
    }
async def process_proxy(self, raw_protocol: int, auth_data: str, domain: str, port: int, collector_id):
    async with self.proxies_semaphore:
        self.logger.debug(
            "start processing proxy {}://{}@{}:{} with collector id {}".format(
                raw_protocol, auth_data, domain, port, collector_id))

        if auth_data is None:
            auth_data = ""

        proxy_url = "{}://".format(Proxy.PROTOCOLS[raw_protocol])
        if auth_data:
            proxy_url += auth_data + "@"
        proxy_url += domain + ":" + str(port)

        start_checking_time = time.time()
        check_result, checker_additional_info = await proxy_utils.check_proxy(proxy_url)
        end_checking_time = time.time()

        if check_result:
            self.logger.debug("proxy {0} works".format(proxy_url))
            await self.create_or_update_proxy(
                raw_protocol,
                auth_data,
                domain,
                port,
                start_checking_time,
                end_checking_time,
                checker_additional_info,
            )
        else:
            self.logger.debug("proxy {0} doesn't work".format(proxy_url))
            try:
                proxy = await db.get(Proxy.select().where(
                    Proxy.raw_protocol == raw_protocol,
                    Proxy.auth_data == auth_data,
                    Proxy.domain == domain,
                    Proxy.port == port,
                ))

                proxy.last_check_time = int(time.time())
                proxy.next_check_time = proxy.last_check_time + proxy.checking_period
                proxy.number_of_bad_checks += 1
                proxy.uptime = int(time.time())

                if proxy.number_of_bad_checks >= settings.DEAD_PROXY_THRESHOLD:
                    proxy.bad_uptime = int(time.time())

                if proxy.number_of_bad_checks == settings.DO_NOT_CHECK_ON_N_BAD_CHECKS:
                    self.logger.debug(
                        "proxy {} isn't checked anymore".format(proxy.to_url()))

                await db.update(proxy)
            except Proxy.DoesNotExist:
                pass
async def get_best_http_proxy(self, request):
    proxy_address = (await db.get(Proxy.select().where(
        Proxy.number_of_bad_checks == 0,
        Proxy.raw_protocol == Proxy.PROTOCOLS.index("http"),
    ).order_by(Proxy.response_time))).address

    return web.Response(text=proxy_address)
async def get_proxies_for_id(self, data: dict) -> dict:
    validate_dict_must_have_key(data, 'id')
    validate_dict_must_have_key(data, 'number')

    number = int(data['number'])
    validate_uint(number)

    # TODO: validate id

    results = []

    for item in await db.execute(Proxy.raw(
            f'SELECT * FROM working_proxies TABLESAMPLE SYSTEM_ROWS({number});')):
        obj = {}
        for field_name in settings.PROXY_PROVIDER_SERVER_API_CONFIG_FETCH_CONFIG['fields']:
            obj[field_name] = getattr(item, field_name)
        results.append(obj)

    return {
        "number_of_results": len(results),
        "results": results,
    }
def load_proxylist(self):
    if not self.scrappers:
        return

    proxylist = set()
    for scrapper in self.scrappers:
        try:
            proxylist.update(scrapper.scrap())
        except Exception as e:
            log.exception('%s proxy scrapper failed: %s',
                          type(scrapper).__name__, e)

    log.info('%s scrapped a total of %d proxies.',
             type(self).__name__, len(proxylist))

    proxylist = self.__parse_proxylist(proxylist).values()
    Proxy.insert_new(proxylist)
async def create_proxy_count_item(timestamp):
    good_proxies_count = await db.count(
        Proxy.select().where(Proxy.number_of_bad_checks == 0))
    bad_proxies_count = await db.count(Proxy.select().where(
        Proxy.number_of_bad_checks > 0,
        Proxy.number_of_bad_checks < settings.DEAD_PROXY_THRESHOLD,
    ))
    dead_proxies_count = await db.count(Proxy.select().where(
        Proxy.number_of_bad_checks >= settings.DEAD_PROXY_THRESHOLD))

    await db.create(
        ProxyCountItem,
        timestamp=timestamp,
        good_proxies_count=good_proxies_count,
        bad_proxies_count=bad_proxies_count,
        dead_proxies_count=dead_proxies_count,
    )
def filte(content):
    soup = BeautifulSoup(content)
    proxy_list_info = soup.findAll('tr')
    proxy_list = []
    for proxy in proxy_list_info:
        td_index = 0
        proxy_tds = proxy.findAll('td')
        has_get = False
        proxy = Proxy()
        for proxy_td in proxy_tds:
            td_index += 1
            if td_index == 2:
                has_get = True
                proxy.ip = proxy_td.text
            elif td_index == 3:
                proxy.port = proxy_td.text
            elif td_index == 4:
                if proxy_td.a is not None:
                    proxy.location = proxy_td.a.text
            elif td_index == 5:
                proxy.anonymous_type = proxy_td.text
            elif td_index == 6:
                proxy.proxy_type = proxy_td.text.lower()
        if has_get:
            proxy_list.append(proxy)
    return proxy_list
async def add_proxy_to_queue(self, proxy: Proxy, collector_id=None):
    async with self.proxies_semaphore:
        asyncio.ensure_future(self.process_proxy(
            proxy.get_raw_protocol(),
            proxy.auth_data,
            proxy.domain,
            proxy.port,
            collector_id,
        ))
def create_or_update_proxy(raw_protocol: Proxy.PROTOCOLS, auth_data, domain, port,
                           start_checking_time, end_checking_time):
    if raw_protocol is None or domain is None or port is None or auth_data is None \
            or start_checking_time is None or end_checking_time is None:
        raise Exception("Bad arguments")

    if raw_protocol < 0 or raw_protocol >= len(Proxy.PROTOCOLS):
        raise Exception("Bad protocol")

    response_time = int(round((end_checking_time - start_checking_time) * 1000000))

    proxy = session.query(Proxy).filter(
        sqlalchemy.and_(Proxy.raw_protocol == raw_protocol,
                        Proxy.auth_data == auth_data,
                        Proxy.domain == domain,
                        Proxy.port == port)).first()

    if proxy:
        # exists, so update
        pass
    else:
        # doesn't exist, so create
        proxy = Proxy(number_of_bad_checks=0,
                      raw_protocol=raw_protocol,
                      auth_data=auth_data,
                      domain=domain,
                      port=port)
        session.add(proxy)

    if proxy.bad_proxy or proxy.uptime is None or proxy.uptime == 0:
        proxy.uptime = int(time.time())

    if proxy.bad_uptime is None or proxy.bad_uptime == 0 or \
            proxy.number_of_bad_checks > settings.DEAD_PROXY_THRESHOLD:
        proxy.bad_uptime = int(time.time())

    proxy.response_time = response_time
    proxy.number_of_bad_checks = 0
    proxy.last_check_time = int(time.time())

    checking_time = int(end_checking_time - start_checking_time)
    if checking_time > settings.PROXY_CHECKING_TIMEOUT:
        checking_time = settings.PROXY_CHECKING_TIMEOUT

    proxy.checking_period = \
        settings.MIN_PROXY_CHECKING_PERIOD \
        + (checking_time / settings.PROXY_CHECKING_TIMEOUT) \
        * (settings.MAX_PROXY_CHECKING_PERIOD - settings.MIN_PROXY_CHECKING_PERIOD)

    session.commit()
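The last block of create_or_update_proxy linearly interpolates the re-check period between MIN_PROXY_CHECKING_PERIOD and MAX_PROXY_CHECKING_PERIOD based on how close the check duration came to the timeout: slow proxies are re-checked less often. A standalone sketch of that formula, with assumed (hypothetical) settings values:

# Minimal sketch of the checking-period interpolation above; the constants are assumptions.
MIN_PROXY_CHECKING_PERIOD = 300   # assumption: 5 minutes
MAX_PROXY_CHECKING_PERIOD = 3600  # assumption: 1 hour
PROXY_CHECKING_TIMEOUT = 10       # assumption: 10 seconds


def checking_period(checking_time):
    # clamp to the timeout, then grow the period linearly from MIN to MAX
    checking_time = min(checking_time, PROXY_CHECKING_TIMEOUT)
    fraction = checking_time / PROXY_CHECKING_TIMEOUT
    return MIN_PROXY_CHECKING_PERIOD + fraction * (
        MAX_PROXY_CHECKING_PERIOD - MIN_PROXY_CHECKING_PERIOD)


print(checking_period(1))   # 630 seconds for a fast proxy
print(checking_period(10))  # 3600 seconds for a proxy that hit the timeout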
def use_thread_with_queue2():
    cleanup()
    # in_queue = Queue.Queue()
    # out_queue = Queue.Queue()
    #
    # for i in range(5):
    #     t = threading.Thread(target=save_proxies_with_queue2,
    #                          args=(in_queue, out_queue))
    #     t.setDaemon(True)
    #     t.start()
    #
    # for url in PROXY_SITES:
    #     in_queue.put(url)
    #
    # result = []
    #
    # for i in range(5):
    #     t = threading.Thread(target=append_result,
    #                          args=(out_queue, result))
    #     t.setDaemon(True)
    #     t.start()
    #
    # in_queue.join()
    # out_queue.join()
    addresses = []
    # mogu_key = ""
    # res = requests.get(mogu_key)
    # addresses = res.json()['msg']
    for address in addresses:
        proxy = Proxy(address=address['ip'] + ':' + address['port'])
        try:
            proxy.save()
        except NotUniqueError:
            pass
    pool = Pool(10)
    pool.map(check_proxy, Proxy.objects.all())
    print(len(addresses))
    print(Proxy.objects.count())
def save_proxies(url):
    proxies = []
    try:
        # if url == 'http://www.kuaidaili.com/free':
        #     import pdb; pdb.set_trace()
        res = requests.get(url)
    except requests.exceptions.RequestException:
        return False
    addresses = re.findall(PROXY_REGEX, res.text)
    for address in addresses:
        proxy = Proxy(address=address)
        try:
            proxy.save()
        except NotUniqueError:
            pass
        else:
            proxies.append(address)
    return proxies
def fetch(url):
    s = requests.Session()
    s.headers.update({'user-agent': get_user_agent()})
    proxies = {
        'http': Proxy.get_random()['address'],
    }
    html_text = s.get(url, timeout=TIMEOUT, proxies=proxies).text
    js_url = gen_js_url(url)
    try:
        js_data = s.get(js_url, timeout=TIMEOUT, proxies=proxies).json()
    except JSONDecodeError:
        raise RequestException()
    return html_text, js_data
def __execute_one_spider_task(self, spider):
    """Run one crawl task for a concrete spider.

    :param spider: spider instance to run
    :return: None
    """
    try:
        for proxy in spider.get_proxies():
            proxy = check_proxy(proxy)
            if proxy.speed != -1:
                session = Session()
                exist = session.query(Proxy) \
                    .filter(Proxy.ip == str(proxy.ip), Proxy.port == str(proxy.port)) \
                    .first()
                if not exist:
                    obj = Proxy(
                        ip=str(proxy.ip),
                        port=str(proxy.port),
                        protocol=proxy.protocol,
                        nick_type=proxy.nick_type,
                        speed=proxy.speed,
                        area=str(proxy.area),
                        score=proxy.score,
                        disable_domain=proxy.disable_domain,
                        origin=str(proxy.origin),
                        create_time=datetime.now()
                    )
                    session.add(obj)
                    session.commit()
                    session.close()
                    logger.info(f'insert: {proxy.ip}:{proxy.port} from {proxy.origin}!')
                else:
                    exist.score['score'] = settings.MAX_SCORE
                    exist.score['power'] = 0
                    exist.port = proxy.port
                    exist.protocol = proxy.protocol
                    exist.nick_type = proxy.nick_type
                    exist.speed = proxy.speed
                    exist.area = proxy.area
                    exist.disable_domain = proxy.disable_domain
                    exist.origin = proxy.origin
                    session.commit()
                    session.close()
                    logger.info(f'update: {proxy.ip}:{proxy.port}, to max score successfully!')
            else:
                logger.info(f'invalid: {proxy.ip}:{proxy.port} from {proxy.origin}!')
    except Exception as e:
        logger.error(f'spider error: {e}')
def __update_proxy(self, proxy, valid=False):
    proxy['scan_date'] = datetime.utcnow()
    if valid:
        proxy['fail_count'] = 0
        self.stats['valid'] += 1
        self.stats['total_valid'] += 1
    else:
        proxy['fail_count'] += 1
        self.stats['fail'] += 1
        self.stats['total_fail'] += 1

    proxy = Proxy.db_format(proxy)

    with self.proxy_updates_lock:
        self.test_hashes.remove(proxy['hash'])
        self.proxy_updates[proxy['hash']] = proxy
async def process_raw_proxy(self, proxy, collector_id):
    self.logger.debug("processing raw proxy \"{}\"".format(proxy))

    try:
        _, auth_data, domain, port = proxy_validator.retrieve(proxy)
    except proxy_validator.ValidationError as ex:
        self.collectors_logger.error(
            "Collector with id \"{}\" returned bad raw proxy \"{}\". "
            "Message: {}".format(collector_id, proxy, ex)
        )
        return

    # don't care about protocol
    try:
        proxy = await db.get(
            Proxy.select().where(
                Proxy.auth_data == auth_data,
                Proxy.domain == domain,
                Proxy.port == port,
            )
        )

        if proxy.last_check_time + settings.PROXY_NOT_CHECKING_PERIOD >= time.time():
            proxy_short_address = ""
            if auth_data:
                proxy_short_address += auth_data + "@"

            proxy_short_address += "{}:{}".format(domain, port)

            self.logger.debug(
                "skipping proxy \"{}\" from collector \"{}\"".format(
                    proxy_short_address, collector_id)
            )
            return
    except Proxy.DoesNotExist:
        pass

    for raw_protocol in range(len(Proxy.PROTOCOLS)):
        while not self.good_proxies_are_processed:
            # TODO: find a better way
            await asyncio.sleep(0.1)

        new_proxy = Proxy()
        new_proxy.raw_protocol = raw_protocol
        new_proxy.auth_data = auth_data
        new_proxy.domain = domain
        new_proxy.port = port

        await self.add_proxy_to_queue(new_proxy, collector_id)
def save_search_result(page, queue, retry=0):
    proxy = Proxy.get_random()['address']
    url = SEARCH_URL.format(SEARCH_TEXT, page)
    try:
        r = fetch(url, proxy=proxy)
    except (Timeout, ConnectionError, IOError):
        sleep(0.1)
        retry += 1
        if retry > 5:
            put_new_page(page, queue)
            raise GreenletExit()
        try:
            p = Proxy.objects.get(address=proxy)
            if p:
                p.delete()
        except DoesNotExist:
            pass
        return save_search_result(page, queue, retry)
    soup = BeautifulSoup(r.text, 'lxml')
    results = soup.find(class_='results')
    if results is None:
        # this proxy has been banned, switch to another one
        sleep(0.1)
        retry += 1
        if retry > 5:
            put_new_page(page, queue)
            print 'retry too much!'
            raise GreenletExit()
        return save_search_result(page, queue, retry)
    articles = results.find_all('div', lambda x: 'wx-rb' in x)
    for article in articles:
        save_article(article)

    page_container = soup.find(id='pagebar_container')
    # u'下一页' is the "next page" link text on the result page
    if page_container and u'下一页' in page_container.text:
        last_page = int(page_container.find_all('a')[-2].text)
        current_page = int(page_container.find('span').text)
        for page in range(current_page + 1, last_page + 1):
            put_new_page(page, queue)
def filte(content):
    soup = BeautifulSoup(content)
    proxy_list_tables = soup.findAll('table')
    table_index = 0
    pattern = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
    proxy_list = []
    for table in proxy_list_tables:
        table_index += 1
        if table_index == 3:
            proxy_list_info = table.findAll('tr')
            for proxy in proxy_list_info:
                td_index = 0
                proxy_tds = proxy.findAll('td')
                proxy = Proxy()
                is_proxy = False
                for proxy_td in proxy_tds:
                    td_index += 1
                    if td_index == 2:
                        rel_ip_info = re.search(pattern, proxy_td.text)
                        if rel_ip_info:
                            proxy.ip = rel_ip_info.group(0)
                            is_proxy = True
                    elif td_index == 3:
                        if is_proxy:
                            proxy.port = int(proxy_td.text)
                    elif td_index == 4:
                        if is_proxy:
                            # '匿名代理' / '高度匿名' mean anonymous / highly anonymous;
                            # store '高匿' (elite) or '透明' (transparent) accordingly
                            if '匿名代理' == proxy_td.text or '高度匿名' == proxy_td.text:
                                proxy.anonymous_type = '高匿'
                            else:
                                proxy.anonymous_type = '透明'
                    elif td_index == 5:
                        if is_proxy:
                            proxy.location = proxy_td.text
                            proxy.proxy_type = 'http'
                if is_proxy:
                    proxy_list.append(proxy)
    return proxy_list
async def fetch(url, retry=0):
    proxy = 'http://{}'.format(Proxy.get_random()['address'])
    headers = {'user-agent': get_user_agent()}
    conn = aiohttp.ProxyConnector(proxy=proxy)
    js_url = gen_js_url(url)
    try:
        with aiohttp.ClientSession(connector=conn) as session:
            with aiohttp.Timeout(TIMEOUT):
                async with session.get(url, headers=headers) as resp:
                    html_text = await resp.text()
                async with session.get(js_url, headers=headers) as resp:
                    js_data = await resp.json()
    except:
        retry += 1
        if retry > 5:
            raise CrawlerError()
        await asyncio.sleep(1)
        return await fetch(url, retry=retry)
    return html_text, js_data
def set_proxy(bot, update, args):
    if len(args) == 1:
        proxy_data = args[0].split(':')
        if len(proxy_data) != 4:
            update.message.reply_text(
                "Please, include the proxy data to the "
                "command, like in the example:\n"
                "<code>/set_proxy mydomain.com:8080:usErnAme:s3cret</code>",
                parse_mode=ParseMode.HTML)
            return
        proxy_ip, proxy_port, proxy_username, proxy_password = proxy_data
        current_proxy = session.query(Proxy).first()
        if current_proxy:
            session.delete(current_proxy)
        new_proxy = Proxy(proxy_ip, proxy_port, proxy_username, proxy_password)
        session.add(new_proxy)
        session.commit()
        update.message.reply_text("Proxy settings updated.")
    else:
        update.message.reply_text(
            "Please, include the proxy data to the "
            "command, like in the example:\n"
            "<code>/set_proxy mydomain.com:8080:usErnAme:s3cret</code>",
            parse_mode=ParseMode.HTML)
async def fetch(retry=0):
    proxy = 'http://{}'.format(Proxy.get_random()['address'])
    headers = {'user-agent': get_user_agent()}
    conn = aiohttp.ProxyConnector(proxy=proxy)
    url = 'http://httpbin.org/ip'
    try:
        with aiohttp.ClientSession(connector=conn) as session:
            with aiohttp.Timeout(TIMEOUT):
                async with session.get(url, headers=headers) as resp:
                    return await resp.json()
    except (ProxyConnectionError, TimeoutError):
        try:
            p = Proxy.objects.get(address=proxy)
            if p:
                p.delete()
        except DoesNotExist:
            pass
        retry += 1
        if retry > 5:
            raise TimeoutError()
        await asyncio.sleep(1)
        return await fetch(retry=retry)
# tail of the preceding check_anonymous() function
        return (False, 0)
    except:
        return (False, 0)


def check_google(proxy_info):
    proxy_content = proxy_info.ip + ':' + str(proxy_info.port)
    proxy = urllib2.ProxyHandler({proxy_info.proxy_type: proxy_content})
    opener = urllib2.build_opener(proxy)
    urllib2.install_opener(opener)
    try:
        time1 = time.time()
        response = urllib2.urlopen(GOOGLE_CHECK_URL, timeout=3)
        title = BeautifulSoup(response.read()).title.text
        if 'Google' == str(title):
            proxy_info.check_time = str(datetime.now()).split('.')[0]
            return (True, (time.time() - time1) * 1000)
        else:
            return (False, 0)
    except:
        return (False, 0)


if __name__ == '__main__':
    proxy = Proxy()
    proxy.ip = '222.74.6.48'
    proxy.port = '8000'
    proxy.proxy_type = 'http'
    default_ip = get_default_ip()
    print check_anonymous(proxy, default_ip)
def cleanup():
    Proxy.drop_collection()