import logging
import random


def get_proxy():
    # Draw random proxies from the pool until a valid one turns up,
    # expiring invalid entries along the way. ProxyPool is provided
    # elsewhere in the project.
    pool = ProxyPool.get_proxy_pool()
    while pool:
        proxy = random.choice(pool)
        if ProxyPool.is_valid_proxy(proxy):
            return proxy
        ProxyPool.expire(proxy)
        pool = ProxyPool.get_proxy_pool()
    logging.error("@get_proxy Error: no available proxy.")
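# A minimal usage sketch (not from the original code): fetch a page through
# whatever get_proxy() returns. The "host:port" string format and TARGET_URL
# are assumptions, since the proxy object's shape is not shown above.
import requests

TARGET_URL = "http://example.com/"

proxy = get_proxy()
if proxy:
    resp = requests.get(TARGET_URL,
                        proxies={"http": "http://%s" % proxy},
                        timeout=10)
    print(resp.status_code)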
def run_proxy_pool_service(**kwargs):
    from thrift.transport import TSocket, TTransport
    from thrift.server import TServer
    from thrift.protocol import TBinaryProtocol
    import proxymaid_rpc.rpc.ProxyPool
    import proxymaid_rpc.settings

    proxy_pool = ProxyPool(kwargs)
    proxy_pool.load()
    logger.debug("proxy_pool load ok, count = %d", proxy_pool.count())

    # Expose the pool over Thrift on the configured port.
    handler = ProxyPoolHandler(proxy_pool)
    processor = proxymaid_rpc.rpc.ProxyPool.Processor(handler)
    transport = TSocket.TServerSocket(port=proxymaid_rpc.settings.PROXY_POOL_LISTEN_PORT)
    tfactory = TTransport.TBufferedTransportFactory()
    pfactory = TBinaryProtocol.TBinaryProtocolFactory()
    server = TServer.TThreadedServer(processor, transport, tfactory, pfactory)

    # Start the web status page before blocking on serve().
    start_proxy_web(proxy_pool, 9100)
    logger.debug("start proxy pool service ok")
    server.serve()
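# A client-side sketch for the Thrift service above. The Client class is
# generated by Thrift alongside the Processor used in the server, but the
# get_proxy() RPC method name is a hypothetical example, since the service's
# .thrift interface is not shown here.
from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
import proxymaid_rpc.rpc.ProxyPool
import proxymaid_rpc.settings

transport = TTransport.TBufferedTransport(
    TSocket.TSocket('localhost', proxymaid_rpc.settings.PROXY_POOL_LISTEN_PORT))
protocol = TBinaryProtocol.TBinaryProtocol(transport)
client = proxymaid_rpc.rpc.ProxyPool.Client(protocol)
transport.open()
try:
    print(client.get_proxy())  # hypothetical RPC; check the .thrift file
finally:
    transport.close()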
import multiprocessing
import queue


def start_spider(sort, numThreads, numProxyThreads):
    pname = multiprocessing.current_process().name

    # task queue and consumer threads
    taskQueue = queue.Queue()
    for _ in range(numThreads):
        SpiderThread(taskQueue).start()
    print('{}: {} threads created.'.format(pname, numThreads))

    # add tasks (producer loop)
    while True:
        idList = get_id_list(sort)
        num = len(idList)
        if num == 0:
            break
        print('{}: {} rows need to update.'.format(pname, num))
        genProxy = ProxyPool(numProxyThreads).gen_proxy()
        for i in idList:
            taskQueue.put((i, sort, next(genProxy)))
        print('{}: {} tasks added!'.format(pname, num))
        # wait for the batch to finish before querying again
        taskQueue.join()
def get_ip_from_proxiesPool():
    # Pop (ip, timestamp) pairs from the Redis-backed pool until one is
    # fresher than 60 seconds.
    proxypool = ProxyPool(redis_host, redis_port)
    is_ip = 0
    while not is_ip:
        try:
            t = proxypool.pop_ip()[0]
            ip = t[0].decode('utf-8')
            timestamp = t[1]
            if time.time() - timestamp > 60:
                print('ip too old')
                time.sleep(1)
            else:
                is_ip = 1
        except Exception as e:
            print('ip proxy is empty, wait.....', e)
            time.sleep(3)
    print(ip)
    return ip, timestamp
from proxy import ProxyPool
from setting import api, redis_host, redis_port, min_ip

# Producer side: keep the Redis pool topped up with at least min_ip
# addresses fetched from the proxy API.
proxypool = ProxyPool(redis_host, redis_port)
proxypool.mother(api, min_ip)
import json
import re
import traceback

import redis

# md5, PROJECT_NAME, TASK_LIST, REDIS_* and logger come from the project's
# settings and helpers.


def thread_task(task_name, user_id, json_config):
    queue = '.'.join([md5(PROJECT_NAME), str(user_id), md5(task_name)])
    redis_db = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT,
                                 password=REDIS_PASSWORD)
    try:
        if task_name not in TASK_LIST:
            return
        logger.info('start listening for task %s, queue key %s' % (task_name, queue))
        while True:
            task = redis_db.blpop(queue + '.TASK', timeout=2)
            if not task:
                if task_name in TASK_LIST:
                    logger.info('task: %s, queue: %s time out' % (task_name, queue))
                    continue
                else:
                    break
            url = task[1]
            try:
                # decide whether to route requests through the proxy pool
                # (ProxyPool exposes a requests-like get())
                if json_config.get('USE_PROXY', 0) == 0:
                    from proxy import ProxyPool
                    requests = ProxyPool(key='927200326076035', debug=True)
                else:
                    import requests
                # decide whether to use a random request header
                if json_config.get('RANDOM_AU', 0) == 0:
                    pass
                resp = requests.get(url)
                logger.info('GET %s' % url)
                task_res = {}
                # extract result fields via the configured regexes
                for res in json_config.get('RESULT', []):
                    key, rex = res[0], res[1]
                    if rex == 'ALL':
                        value = resp.content
                    else:
                        value = re.findall(rex, resp.content)[0]
                    task_res[key] = value
                # extract follow-up URLs and enqueue the unseen ones
                for task_rex in json_config.get('TASK_REX', []):
                    next_urls = re.findall(task_rex, resp.content)
                    for next_url in next_urls:
                        # skip URLs that have already been queued
                        if redis_db.sismember(queue + '.ALL', next_url):
                            continue
                        # '.ALL' must be a set for sismember to work
                        redis_db.sadd(queue + '.ALL', next_url)
                        redis_db.lpush(queue + '.TASK', next_url)
                # push the result onto the result queue
                logger.info('OK %s' % url)
                redis_db.lpush(queue + '.OK', url)
                redis_db.lpush(queue + '.RESULT', json.dumps(task_res))
            except Exception as e:
                logger.info('ERR %s %s' % (url, str(e)))
                # push the error onto the error queue
                error_msg = {
                    'task_name': task_name,
                    'url': url,
                    'simple_msg': str(e),
                    'more_msg': traceback.format_exc()
                }
                redis_db.lpush(queue + '.ERROR', json.dumps(error_msg))
    except Exception as e:
        logger.info('task error %s %s' % (task_name, str(e)))
        # push the error onto the error queue
        error_msg = {
            'task_name': task_name,
            'simple_msg': str(e),
            'more_msg': traceback.format_exc()
        }
        redis_db.lpush(queue + '.ERROR', json.dumps(error_msg))
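# A hedged sketch of how a caller might drive thread_task's queues, based
# only on the key names used above; the seed URL and 'demo_task' name are
# made up for illustration.
import json
import redis

redis_db = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT,
                             password=REDIS_PASSWORD)
queue = '.'.join([md5(PROJECT_NAME), str(1), md5('demo_task')])

# seed the task queue
redis_db.lpush(queue + '.TASK', 'http://example.com/')

# drain parsed results as they arrive
item = redis_db.blpop(queue + '.RESULT', timeout=10)
if item:
    print(json.loads(item[1]))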
import asyncio
import time

import aiohttp
import async_timeout
import numpy as np

# ProxyPool, BikeLocation, logger and the module-level semaphore `sema`
# are provided elsewhere in the project.


class Crawler:
    headers = {
        'charset': "utf-8",
        'platform': "4",
        "referer": "https://servicewechat.com/wx40f112341ae33edb/1/",
        'content-type': "application/x-www-form-urlencoded",
        'user-agent': "MicroMessenger/6.5.4.1000 NetType/WIFI Language/zh_CN",
        'host': "mwx.mobike.com",
        'connection': "Keep-Alive",
        'accept-encoding': "gzip",
        'cache-control': "no-cache"
    }
    mobike_url = "https://mwx.mobike.com/mobike-api/rent/nearbyBikesInfo.do"
    data_format = "latitude={}&longitude={}&errMsg=getMapCenterLocation"

    # bounding box of the scan area, stepped in offset-sized increments
    left = 30.7828453209
    top = 103.9213455517
    right = 30.4781772402
    bottom = 104.2178123382
    offset = 0.002

    def __init__(self, loop):
        self.loop = loop
        self.proxy_pool = ProxyPool(loop)

    async def get_bike(self, lat, lon):
        proxy = self.proxy_pool.pick()
        async with sema:
            with async_timeout.timeout(5, loop=self.loop):
                try:
                    async with aiohttp.request(
                            method='POST',
                            url=self.mobike_url,
                            headers=self.headers,
                            data=self.data_format.format(lat, lon),
                            proxy=proxy.url,
                    ) as resp:
                        ret = await resp.json()
                        if ret:
                            logger.info("success proxy: %s", proxy.url)
                            self.save(ret)
                except Exception as e:
                    proxy.error()
                    logger.error("get bike error: %s, lat: %s, lon: %s",
                                 str(e), str(lat), str(lon))
                finally:
                    self.total += 1
                    logger.info("success: %s", str(self.total))

    def save(self, ret):
        for item in ret['object']:
            BikeLocation.new_location(item)

    async def run(self):
        logger.info("start")
        self.total = 1
        start = int(time.time())
        # fire one request task per grid cell of the bounding box
        lat_range = np.arange(self.left, self.right, -self.offset)
        future_list = []
        for lat in lat_range:
            lon_range = np.arange(self.top, self.bottom, self.offset)
            for lon in lon_range:
                future_list.append(asyncio.ensure_future(self.get_bike(lat, lon)))
        for task in future_list:
            await task
        logger.info("get %s", str(self.total))
        logger.info("done cost: %s", str(int(time.time()) - start))
        logger.info("proxys: %s", str(self.proxy_pool.proxies))
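# A hedged driver sketch (not from the original code). It assumes this runs
# in the same module as Crawler, since get_bike looks up a module-level
# semaphore named `sema`; the concurrency limit of 30 is a guess.
import asyncio

loop = asyncio.get_event_loop()
sema = asyncio.Semaphore(30)
loop.run_until_complete(Crawler(loop).run())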
import queue
import threading
import traceback

from bs4 import BeautifulSoup
from pybloom import BloomFilter

from proxy import KDLProxyFinder, ProxyPool
from Writer import Writer

basePageURL = 'https://www.proginn.com/'
baseUserURL = 'https://www.proginn.com/wo/'
userQueue = queue.Queue()
writer = Writer()
working = 1
myFilter = BloomFilter(capacity=150000, error_rate=0.0000001)

finder = KDLProxyFinder("http://www.kuaidaili.com/free/inha/")
# finder = XiciProxyFinder("http://www.xicidaili.com/wn/")
ppool_instance = ProxyPool(finder)
ppool_instance.get_proxies()


def handleHTML(html, user):
    try:
        soup = BeautifulSoup(html, "html.parser")
        tag = soup.find(class_='nickname')
        user.nickname = tag.a.string
        # tag = soup.find(class_='introduction')
        # if len(tag.contents) == 3:
        #     user.city = "unknown"
    except Exception:
        traceback.print_exc()
# -*- coding: utf-8 -*-
import threading
import traceback

import requests
from bs4 import BeautifulSoup

from proxy import KDLProxyFinder, ProxyPool

baseurl = "http://www.plantphoto.cn/class"

r = requests.get(baseurl)
html = r.text
# soup = BeautifulSoup(html, "html.parser")

finder = KDLProxyFinder("http://www.kuaidaili.com/free/inha/")
ppool_instance = ProxyPool(finder)
ppool_instance.get_proxies()

# fetch the same page again, this time through a proxy from the pool
r = requests.get(baseurl, proxies=ppool_instance.get_one_proxy())
html = r.text
print(html)
import time

import requests
from multiprocessing.pool import ThreadPool

# ProxyPool, login, spider, write_db, get_data_from_datetime and the
# redis_host/redis_port settings come from elsewhere in the project.


def muti_control(id_list):
    pool = ThreadPool(7)
    proxypool = ProxyPool(redis_host, redis_port)

    # simulated login
    s = requests.Session()
    is_login = 0
    while not is_login:
        ip, timestamp = get_ip_from_proxiesPool()
        is_login = login(s, ip)
        # time.sleep(10)
    print('login succeeded')
    proxies = {
        'http': 'http://{}'.format(ip),
        'https': 'https://{}'.format(ip)
    }
    requests.post(
        url='http://www.pss-system.gov.cn/sipopublicsearch/patentsearch/pageIsUesd-pageUsed.shtml',
        proxies=proxies,
        cookies=s.cookies)
    print('tou ming zhuang ')

    while True:
        # track how many times the ip has been used
        # datetime_info = get_data_from_datetime(threads, sub_duty[0], sub_duty[-1])
        datetime_info = get_data_from_datetime(10, id_list)
        if len(datetime_info) == 0:
            print(time.asctime())
            break
        task = [list(t) for t in datetime_info]
        for i in task:
            i.append(s.cookies)
            i.append(ip)
        result = pool.map(spider, task)

        # decide from the results whether the ip or the cookies must be replaced
        needIp = 0
        needCookies = 0
        for r in result:  # r = (id, status, content)
            if r[1] == 1:
                # good data, write it to the database
                write_db(r[0], r[2])
            elif r[1] == 2:
                # cookies rejected
                print(r[0], 'cookies wrong')
                needCookies = 1
                needIp = 1
            elif r[1] == 3:
                # ip expired
                print(r[0], 'ip wrong')
                needIp = 1

        # refresh the session state according to the results
        if needCookies:
            is_login = 0
            while not is_login:
                print('login again...')
                ip, timestamp = get_ip_from_proxiesPool()
                is_login = login(s, ip)
            proxies = {  # rebuild for the new ip (the original reused the stale dict)
                'http': 'http://{}'.format(ip),
                'https': 'https://{}'.format(ip)
            }
            requests.post(
                url='http://www.pss-system.gov.cn/sipopublicsearch/patentsearch/pageIsUesd-pageUsed.shtml',
                proxies=proxies,
                cookies=s.cookies)
            print('tou ming zhuang ')
            proxypool.push_ip(ip, timestamp)
        elif needIp:
            while True:
                ip, timestamp = get_ip_from_proxiesPool()
                print('renew ip ...')
                proxies = {
                    'http': 'http://{}'.format(ip),
                    'https': 'https://{}'.format(ip)
                }
                try:
                    rsp_ipTest = requests.get(
                        'http://www.pss-system.gov.cn/sipopublicsearch/portal/uiIndex.shtml',
                        proxies=proxies,
                        timeout=10)
                    # '访问受限' ("access restricted") in the body means the ip is blocked
                    if rsp_ipTest.status_code == 200 and '访问受限' not in rsp_ipTest.text:
                        try:
                            requests.post(
                                url='http://www.pss-system.gov.cn/sipopublicsearch/patentsearch/pageIsUesd-pageUsed.shtml',
                                proxies=proxies,
                                cookies=s.cookies,
                                timeout=7)
                            print('tou ming zhuang ')
                            break
                        except Exception:
                            pass
                except Exception:
                    pass