def get_info(ip, port):
    # Fetch the detailed information (country/region/city/ISP) behind a proxy IP
    url = CHECHINFO_IP
    headers = get_headers()
    proxies = {
        'http': 'http://{ip}:{port}'.format(ip=ip, port=port),
        'https': 'http://{ip}:{port}'.format(ip=ip, port=port)
    }
    params = {'ip': ip}
    try:
        start_time = time.time()
        resp = requests.get(url, headers=headers, params=params,
                            proxies=proxies, timeout=15).json()
        end_time = time.time()
        speed = '%.2f' % (end_time - start_time)
        text = resp['data']
        if text['country'] and text['region']:
            country = text['country']
            region = text['region']
            city = text['city']
            isp = text['isp']
            return country, region, city, isp, speed
        else:
            return None
    except BaseException as e:
        print(e)
        return None
def crawl_github(url):
    # Crawl a proxy list hosted on GitHub: one JSON record per line
    headers = get_headers()
    try:
        resp = requests.get(url, headers=headers, timeout=15).text
        contents = resp.split('\n')[:-2]
        ip = []
        port = []
        types = []
        protocol = []
        for i in contents:
            test = json.loads(i)
            if test['country'] == 'CN':
                ip.append(test['host'])
                port.append(test['port'])
                if test['anonymity'] == 'high_anonymous':
                    types.append('高匿')      # high anonymity
                elif test['anonymity'] == 'anonymous':
                    types.append('匿名')      # anonymous
                else:
                    types.append('透明')      # transparent
                protocol.append(test['type'].upper())
        return zip(ip, port, types, protocol)
    except BaseException as e:
        # print(e)
        return None
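# A minimal usage sketch for crawl_github (not part of the original module).
# The URL below is a placeholder - the real raw proxy-list URL is configured
# elsewhere in the project.
if __name__ == '__main__':
    rows = crawl_github('https://raw.githubusercontent.com/example/proxy-list/master/proxy.list')  # placeholder URL
    if rows:
        for host, port, kind, protocol in rows:
            print(host, port, kind, protocol)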
def download(self, url):
    print("Download:{}".format(url))
    # The url keeps changing, so exception handling is needed
    r = requests.get(url, headers=get_headers(), timeout=10)
    # print(chardet.detect(r.content))
    # encoding detection result looks like {'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}
    r.encoding = chardet.detect(r.content)["encoding"]  # decode with the detected encoding
    if r.status_code == 200 or len(r.content) > 500:  # if the request succeeded
        return r.text  # return the page text
def crawl_cloud(url):
    # Crawl the Yun proxy (云代理) free proxy list
    headers = get_headers()
    try:
        resp = session.get(url, headers=headers)
        contents = resp.html.find('#list tr')[1:]
        ip = []
        port = []
        proxy_kind = []
        proxy_type = []
        for i in range(len(contents)):
            items = contents[i]
            contents_list = items.text.split('\n')
            ip.append(contents_list[0])
            port.append(contents_list[1])
            proxy_kind.append('高匿')  # anonymity is hard-coded as 高匿 (high anonymity)
            proxy_type.append(contents_list[3])
        return list(zip(ip, port, proxy_kind, proxy_type))
    except BaseException as e:
        # print(e)
        return None
def check_ip(ip, port):
    # Check whether a proxy IP is usable
    url = TEST_IP
    headers = get_headers()
    proxies = {
        'http': 'http://{ip}:{port}'.format(ip=ip, port=port),
        'https': 'http://{ip}:{port}'.format(ip=ip, port=port)
    }
    try:
        start_time = time.time()
        resp = requests.get(url, headers=headers, proxies=proxies, timeout=15).text
        end_time = time.time()
        speed = '%.2f' % (end_time - start_time)
        # TEST_IP returns a JS assignment; strip the wrapper, then compare the
        # reported client IP ('cip') with the proxy IP
        if ip == json.loads(resp.replace(';', '').split('=')[-1].strip())['cip']:
            return speed
        else:
            return None
    except:
        return None
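# A minimal sketch of how check_ip and get_info might be combined, assuming both
# functions live in the same checker module. The proxy address is a placeholder.
if __name__ == '__main__':
    speed = check_ip('1.2.3.4', '8080')
    if speed is not None:
        info = get_info('1.2.3.4', '8080')
        if info:
            country, region, city, isp, _ = info
            print(country, region, city, isp, speed)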
def check_proxy(redis_to, ip_port, zname, url=None):
    if not url:
        url = "https://www.baidu.com"
    ip, port, *_ = ip_port.split(":")
    proxies = {"http": f"http://{ip}:{port}", "https": f"http://{ip}:{port}"}
    print("Start checking", ip_port)
    start_time = time.time()
    try:
        res = requests.get(url=url, headers=config.get_headers(), proxies=proxies, verify=False)
        if res.status_code == 200:
            # speed = round(time.time() - start_time, 3)
            speed = int(time.time() - start_time)
            print(f"Response time {speed}s, usable proxy {proxies}")
            change_score(redis_to, ip_port, zname, str(speed), 1)
        # a non-200 response falls through without a score change
    except Exception:
        change_score(redis_to, ip_port, zname, "1001", 0)
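# A minimal sketch of driving check_proxy from a Redis sorted set. The key name
# 'proxies_zset' and the connection settings are placeholders - the project's real
# values come from its config, and change_score is assumed to be defined alongside
# check_proxy.
import redis

def check_all(zname='proxies_zset'):
    redis_to = redis.StrictRedis(host='127.0.0.1', port=6379, decode_responses=True)
    for ip_port in redis_to.zrange(zname, 0, -1):
        check_proxy(redis_to, ip_port, zname)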
import json
import requests
from config import get_headers
from requests_html import HTMLSession
from util.browsertool import create_browser, select_em

header = get_headers()
urls = 'http://www.xicidaili.com/nn/1'
session = HTMLSession()


def crawl_xici(url):
    # Crawl the Xici (西刺) free proxy list
    headers = get_headers()
    try:
        resp = session.get(url, headers=headers)
        contents = resp.html.find('#ip_list tr')[1:]
        ip = []
        port = []
        proxy_kind = []
        proxy_type = []
        for i in range(len(contents)):
            items = contents[i]
            contents_list = items.text.split('\n')
            ip.append(contents_list[0])
            port.append(contents_list[1])
            if len(contents_list) >= 7:
                proxy_kind.append(contents_list[3])
                proxy_type.append(contents_list[4])
            else:
                # assumption: rows missing the location column shift the anonymity
                # and protocol fields one position to the left
                proxy_kind.append(contents_list[2])
                proxy_type.append(contents_list[3])
        return list(zip(ip, port, proxy_kind, proxy_type))
    except BaseException as e:
        # print(e)
        return None
# -*- coding:utf-8 -*-
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from config import get_headers
from selenium import webdriver

headers = get_headers()


# Create a browser engine
def create_browser(op_type):
    if op_type == 'close':
        # 'close' = run Chrome headless (no visible window)
        chrome_options = webdriver.ChromeOptions()
        # mobile_emulation = {"deviceName": "Galaxy S5"}
        chrome_options.add_argument('--headless')
        # chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('disable-infobars')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--no-sandbox')
        for key, value in headers.items():
            chrome_options.add_argument(key + '=' + value)
        browser = webdriver.Chrome(chrome_options=chrome_options)
        browser.delete_all_cookies()
        return browser
    elif op_type == 'open':
        # 'open' = run Chrome with a visible window; assumed to mirror the
        # headless branch above minus the '--headless' flag
        chrome_options = webdriver.ChromeOptions()
        # mobile_emulation = {"deviceName": "Galaxy S5"}
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('disable-infobars')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--no-sandbox')
        for key, value in headers.items():
            chrome_options.add_argument(key + '=' + value)
        browser = webdriver.Chrome(chrome_options=chrome_options)
        browser.delete_all_cookies()
        return browser
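# A minimal sketch of using create_browser (illustrative; the URL is only an example).
if __name__ == '__main__':
    browser = create_browser('close')   # 'close' = headless mode
    try:
        browser.get('https://www.baidu.com')
        print(browser.title)
    finally:
        browser.quit()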
def download(url):
    print("Download:{}".format(url))
    # The url keeps changing, so exception handling is needed
    r = requests.get(url, headers=get_headers(), timeout=10)
    r.encoding = chardet.detect(r.content)["encoding"]  # decode with the detected encoding
    if r.status_code == 200 or len(r.content) > 500:  # if the request succeeded
        return r.text  # return the page text