def get_proxies():
    """Scrape "ip:port" proxy strings from the next URL in the global URL_SET.

    trip-style coroutine (requests + tornado): ``yield trip.get`` runs the
    request cooperatively.  Be careful with the machine's outbound-TCP limits —
    many of these coroutines run concurrently.

    Returns:
        list[str]: every "ip:port" pair found on the page (may be empty).
    """
    global ITL  # index used to walk URL_SET across successive calls
    this_web_has = 0  # total proxies this page yielded, valid or not
    ITL += 1
    # One decimal octet, 0-255.  BUG FIX: the original '2[0-5][0-5]' only
    # matched 200-255 when BOTH trailing digits were 0-5, so valid octets
    # such as 206-209, 216-219, ..., 246-249 were silently skipped.
    pi = (r'(?:((?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)'
          r'\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)'
          r'\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)'
          r'\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d))'
          # port part kept as in the original — NOTE(review): it rejects some
          # legal ports (e.g. 65534); confirm intended range before widening.
          r'\D+?(6[0-5]{2}[0-3][0-5]|[1-5]\d{4}|[1-9]\d{1,3}|[0-9]))')
    ipandportlist = []
    print(URL_SET[ITL])
    try:
        r = yield trip.get(URL_SET[ITL], timeout=30, headers=header)
        for ip, port in re.findall(pi, r.text):
            # re.findall returns (ip, port) tuples; join them directly instead
            # of round-tripping through str(tuple) and five .replace calls.
            ipandportlist.append(ip + ':' + port)
            this_web_has += 1
    except Exception as detail:
        print('This website has a problem.', detail)
        # BUG FIX: the original called get_proxies() here, but calling a
        # generator function only *creates* a generator — the retry never ran.
        # The dead call is removed; the caller decides whether to retry.
    print(URL_SET[ITL] + ' has ' + str(this_web_has) + ' proxies.')
    return ipandportlist
def get_proxies():
    """Scrape "ip:port" proxy strings from the next URL in the global URL_SET.

    Variant that accumulates results into the global ``ALL`` list instead of
    returning them.  (Comments translated from the original Chinese.)
    """
    global ALL
    global ITL  # walks URL_SET; mutated here, hence the global declaration
    this_web_has = 0  # total proxies this page listed, working or not
    ITL += 1
    # Regex for "IP<sep>port".  BUG FIX: the original octet '2[0-5][0-5]'
    # only matched 200-255 when both trailing digits were 0-5, skipping
    # valid octets such as 206-249.
    pi = (r'(?:((?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)'
          r'\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)'
          r'\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)'
          r'\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d))'
          # port alternation kept as-is — NOTE(review): confirm intended range.
          r'\D+?(6[0-5]{2}[0-3][0-5]|[1-5]\d{4}|[1-9]\d{1,3}|[0-9]))')
    ipandportlist = []  # kept for interface parity with the other variant
    print(URL_SET[ITL])
    try:
        r = yield trip.get(URL_SET[ITL], timeout=200, headers=header)
        # print(r.text)
        for ip, port in re.findall(pi, r.text):
            # Join the (ip, port) capture tuple directly rather than
            # str(tuple) plus five chained .replace calls.
            ALL.append(ip + ':' + port)
            this_web_has += 1
    except Exception as detail:
        print('This website has a problem.', detail)
        # BUG FIX: the original's bare get_proxies() call here only created a
        # generator object and never executed — removed as dead code.
    print(URL_SET[ITL] + ' has ' + str(this_web_has) + ' proxies.')
def test_proxy(proxy):
    """Probe httpbin.org through *proxy*; report a working proxy via trip.Return.

    Any failure (timeout, connection error, wrong page body) is swallowed and
    the coroutine simply yields nothing — only a verified proxy is returned.
    """
    try:
        reply = yield trip.get('http://httpbin.org/get', timeout=5,
                               proxies={'http': proxy, 'https': proxy})
        if 'httpbin.org' not in reply.text:
            raise Exception('Invalid reply')
    except Exception:
        pass  # dead proxy — drop it silently
    else:
        raise trip.Return(proxy)
def requests_a(url, square_, type_):
    """Query the AMap polygon-search API and route the response.

    Small result sets (< 50 hits) are published whole to the
    'amap_result_json' queue; larger ones enqueue per-page URLs on
    'amap_page_url' for later fetching.

    Args:
        url: fully-built AMap polygon-search request URL.
        square_: "x1,y1;x2,y2" polygon string, reused when building page URLs.
        type_: AMap POI type code, reused when building page URLs.

    Side effects: sets the global ``result``; exits the process on
    DAILY_QUERY_OVER_LIMIT; publishes to RabbitMQ via ``r_result``/``r_page``.
    """
    global result
    try:
        result = yield trip.get(url)
    except Exception as e:
        return
    print('-----------------{}'.format(result.text))
    if 'DAILY_QUERY_OVER_LIMIT' in result.text:
        sys.exit()  # API quota exhausted — stop the whole crawler
    try:
        status = result.json()['status']
    except Exception as e:
        print(e)
        print(result)
        return
    # BUG FIX: the original tested `status is '1'` — identity, not equality,
    # which only works by CPython string-interning accident.
    if status == '1':
        count = int(result.json()['count'])
        if count != 0:
            if count < 50:
                print('count < 50')
                connection_result = r_result.connection
                channel = connection_result.channel()
                channel.queue_declare(queue='amap_result_json')
                channel.basic_publish(exchange='',
                                      routing_key='amap_result_json',
                                      body=json.dumps(result.json()))
                channel.close()
                print(result.json())
                connection_result.close()
            else:
                print('count > 50')
                connection_page = r_page.connection
                channel = connection_page.channel()
                channel.queue_declare(queue='amap_page_url')
                # BUG FIX: int(count/50 + 0.5) *rounds* the page count, so the
                # last page was dropped whenever count % 50 < 25 (e.g. a count
                # of 120 produced 2 pages instead of the 3 required).  Use a
                # true ceiling; page 1 is this response, pages 2..pages follow.
                pages = (count + 49) // 50
                for i in range(1, pages):
                    channel.basic_publish(
                        exchange='',
                        routing_key='amap_page_url',
                        body='http://restapi.amap.com/v3/place/polygon?polygon='
                             + square_ + ';&types=' + type_
                             + '&output=JSON&offset=50' + '&page=' + str(i + 1),
                    )
                    print('分页 的url放入')
                channel.close()
                # connection_page.close()
        else:
            print('count = 0')
            return
    else:
        return
def asyn_message(body,api_key):
    """Build AMap polygon-search URLs from a JSON message and fetch them all.

    *body* is a UTF-8 encoded JSON array of objects, each carrying a
    'square_list' of four coordinates and a 'type' POI code; one request URL
    is built per object and all are fetched concurrently via trip.
    """
    body = body.decode('utf8')
    url_list = []
    for i in json.loads(body):
        s, t = i['square_list'], i['type']
        # "x1,y1;x2,y2" — the rectangle's two corner coordinates.
        square_ = str(s[0]) + ',' + str(s[1]) + ';' + str(s[2]) + ',' + str(s[3])
        url_list.append('http://restapi.amap.com/v3/place/polygon?polygon=' + square_ + ';&types=' + t + '&output=JSON&key=' + \
            api_key + '&offset=50')
    # Yielding a list of trip.get futures runs all requests concurrently.
    r = yield [trip.get(url) for url in url_list]
    for result in r:
        # NOTE(review): the requests_a visible nearby takes three arguments
        # and is itself a generator — calling it with one argument and
        # discarding the generator would do nothing.  Presumably a different
        # requests_a(result) overload is in scope here — confirm.
        requests_a(result)
def test_proxy(proxy):
    """Check that *proxy* can still fetch http://httpbin.org/get.

    A proxy that returns the expected page is appended to the global
    ``valid_proxy`` list; when no exception occurred the proxy is also
    reported to the caller via ``trip.Return`` (trip's coroutine-return
    convention).  Uses trip so many probes run concurrently.
    """
    global valid_proxy
    try:
        page = yield trip.get('http://httpbin.org/get', timeout=40,
                              proxies={'http': proxy, 'https': proxy})
        if 'httpbin' in page.text:
            valid_proxy.append(proxy)
            print('currently, we have ' + str(len(valid_proxy)) + ' valid proxies')
    except Exception as detail:
        print("ERROR:", detail)
    else:
        raise trip.Return(proxy)
def test_proxy(proxy):
    """Probe *proxy* against httpbin and record working ones (deduplicated).

    Appends to the global ``valid_proxy``, then rebuilds it as a de-duplicated
    list; reports the proxy back through ``trip.Return`` when the request
    raised no exception.
    """
    global ALL
    global valid_proxy
    try:
        print(proxy)
        reply = yield trip.get('http://httpbin.org/get', timeout=200,
                               proxies={'http': proxy, 'https': proxy})
        if 'httpbin' in reply.text:
            valid_proxy.append(proxy)
            # de-duplicate after every insert (order is not preserved)
            valid_proxy = list(set(valid_proxy))
            print('currently, we have ' + str(len(valid_proxy)) + ' valid proxies')
    except Exception as detail:
        print("ERROR:", detail)
    else:
        raise trip.Return(proxy)
def main():
    """Fetch https://httpbin.org/get once and print the raw response body."""
    response = yield trip.get('https://httpbin.org/get')
    print(response.content)
def get_proxies(number=10):
    """Fetch up to *number* proxies from the 89ip.cn API.

    Args:
        number: how many proxies to request from the API (default 10).

    Returns (via ``raise trip.Return``): list of "ip:port" strings parsed
    out of the response body.
    """
    r = yield trip.get('http://www.89ip.cn/apijk/' +
                       '?&tqsl=%s&sxa=&sxb=&tta=&ports=&ktip=&cf=1' % number)
    # BUG FIX: the original pattern used a bare '.' between octets, which
    # matches ANY character, and was not a raw string.  Escape the dot and
    # use r'' so the backslashes reach the regex engine intact.
    p = re.findall(r'((?:\d{1,3}\.){3}\d{1,3}:\d+)', r.text)
    raise trip.Return(p)
def async_fetch():
    """Issue several concurrent GETs of the same target and return the replies.

    NOTE(review): ``url`` and ``times`` are free variables — presumably
    module-level globals; confirm they are defined before this runs.
    """
    responses = yield [trip.get(url) for _ in range(times)]
    raise trip.Return(responses)
def asyn_message(body):
    """Concurrently fetch every URL in the JSON-encoded list *body*.

    *body* is a bytes payload holding a JSON array of URL strings; all are
    fetched in parallel via trip, then each response is handed to requests_a.
    """
    urls = json.loads(body.decode())
    replies = yield [trip.get(u) for u in urls]
    for reply in replies:
        requests_a(reply)