def fetch_kuaidaili() -> list:
    """Scrape free proxies from kuaidaili.

    Requests listing pages 1 and 2, reads the IP and PORT table cells from
    each page and pairs them positionally, pausing three seconds after each
    page.

    Returns:
        A list of ``ip:port`` strings. Any exception is logged and yields an
        empty list, discarding whatever had been collected so far.
    """
    base_url = 'https://www.kuaidaili.com/free/inha/{}/'
    proxies = []
    try:
        for page in range(1, 3):
            html = get_html(base_url.format(page))
            hosts = html.xpath(
                '//div[@id="list"]/table/tbody/tr/td[@data-title="IP"]/text()')
            ports = html.xpath(
                '//div[@id="list"]/table/tbody/tr/td[@data-title="PORT"]/text()'
            )
            proxies.extend(
                '{}:{}'.format(host, port) for host, port in zip(hosts, ports)
            )
            # Pause between page requests.
            sleep(3)
    except Exception:
        logger.exception('error')
        traceback.print_exc()
        proxies = []
    if not proxies:
        logger.warning(' fail to fetch kuaidaili')
    return proxies
def fetch_kxdaili() -> list:
    """Scrape free proxies from kxdaili (kaixin proxy).

    Requests listing pages 1 through 3. On each page every table cell's text
    is collected, only cells made purely of digits and dots are kept, and the
    survivors are read as alternating IP, port, IP, port, ...
    (NOTE(review): this relies on no other purely numeric cell appearing in a
    row -- confirm against the live page.)

    Returns:
        A list of ``ip:port`` strings. Any exception is logged and yields an
        empty list.
    """
    base_url = 'http://www.kxdaili.com/dailiip/1/{}.html#ip'
    proxies = []
    try:
        for page in range(1, 4):
            html = get_html(base_url.format(page))
            cells = [td.text for td in html.xpath('//tbody/tr/td')]
            # Keep non-empty cells that contain nothing but digits and dots.
            numeric = [
                text for text in cells
                if text and text.replace('.', '').isdigit()
            ]
            # Even positions are addresses, odd positions are ports.
            proxies.extend(
                '{}:{}'.format(host, port)
                for host, port in zip(numeric[0::2], numeric[1::2])
            )
    except Exception:
        logger.exception('error')
        traceback.print_exc()
        proxies = []
    if not proxies:
        logger.warning(' fail to fetch kxdaili')
    return proxies
def fetch_3366ip() -> list:
    """Scrape free proxies from ip3366.

    Requests listing pages 1 through 3. On each page every table cell's text
    is collected, only cells made purely of digits and dots are kept, and the
    survivors are read as alternating IP, port, IP, port, ...
    (NOTE(review): this relies on no other purely numeric cell appearing in a
    row -- confirm against the live page.)

    Returns:
        A list of ``ip:port`` strings. Any exception is logged and yields an
        empty list.
    """
    base_url = 'http://www.ip3366.net/free/?stype=1&page={}'
    try:
        proxies = []
        for page in range(1, 4):
            url = base_url.format(page)
            html = get_html(url)
            tmp = [i.text for i in html.xpath('//tbody/tr/td')]
            # A <td> with no direct text has ``.text`` of None; skip those
            # (and empty strings) before calling str methods. Without this
            # guard one empty cell raised AttributeError and the broad
            # handler below threw away every proxy already collected.
            res = [
                i for i in tmp
                if i and (i.isdigit() or ''.join(i.split('.')).isdigit())
            ]
            # Even positions are addresses, odd positions are ports.
            for ip, port in zip(res[0::2], res[1::2]):
                proxies.append('{}:{}'.format(ip, port))
    except Exception:
        logger.exception('error')
        traceback.print_exc()
        proxies = []
    if not proxies:
        logger.warning(' fail to fetch 3366ip')
    return proxies
def add_one(self, item: str):
    """Insert one proxy into the table, marked as valid.

    Args:
        item: The proxy string to store in the ``IP_PORT`` column.

    The insert is committed on success. On any error the transaction is
    rolled back and the failure is logged; nothing is raised to the caller.
    """
    # Table names cannot be bound as SQL parameters, so only the table name
    # is formatted into the statement. The proxy value is bound with a
    # placeholder so quotes in it cannot break or alter the statement.
    sql = "INSERT INTO %s (IP_PORT, valid) VALUES (?, 1)" % self.db_name
    try:
        self.cur.execute(sql, (item,))
        self.conn.commit()
    except Exception:
        self.conn.rollback()
        logger.exception('add fail')
        traceback.print_exc()
def set_false(self, item: str):
    """Mark the given proxy as invalid.

    The row is not deleted: its ``valid`` column is set to 0.

    Args:
        item: The ``IP_PORT`` value of the proxy to invalidate.

    The update is committed on success. On any error the transaction is
    rolled back and the failure is logged; nothing is raised to the caller.
    """
    # Table names cannot be bound as SQL parameters, so only the table name
    # is formatted into the statement. The proxy value is bound with a
    # placeholder; splicing it into the text let a crafted value widen the
    # WHERE clause to every row.
    sql = "UPDATE %s SET valid=0 WHERE IP_PORT=?" % self.db_name
    try:
        self.cur.execute(sql, (item,))
        self.conn.commit()
    except Exception:
        self.conn.rollback()
        logger.exception('set false fail')
        traceback.print_exc()
def get_all(self) -> list:
    """Return the ``IP_PORT`` value of every row in the table.

    Both valid and invalid proxies are included.

    Returns:
        A list of proxy strings. If the query fails the error is logged and
        an empty list is returned.
    """
    # Bound before the try so the return below is defined even when the
    # query raises (previously that path hit UnboundLocalError).
    ports = []
    try:
        sql = 'SELECT IP_PORT FROM %s'
        self.cur.execute(sql % self.db_name)
        ports = [row[0] for row in self.cur.fetchall()]
    except Exception:
        logger.exception('get all fail')
        traceback.print_exc()
    return ports
def get_invalid_items(self) -> list:
    """Return the ``IP_PORT`` value of every row whose ``valid`` is 0.

    Returns:
        A list of proxy strings. If the query fails the error is logged and
        an empty list is returned.
    """
    # Bound before the try so the return below is defined even when the
    # query raises (previously that path hit UnboundLocalError).
    ports = []
    try:
        sql = 'SELECT IP_PORT FROM %s where valid=0'
        self.cur.execute(sql % self.db_name)
        ports = [row[0] for row in self.cur.fetchall()]
    except Exception:
        logger.exception('get invalid items fail')
        traceback.print_exc()
    return ports
def get_valid_count(self) -> int:
    """Return the number of rows whose ``valid`` is 1.

    Returns:
        The count of valid proxies. If the query fails the error is logged,
        the connection is rolled back and 0 is returned.
    """
    # Bound before the try so the return below is defined even when the
    # query raises (previously that path hit UnboundLocalError).
    count = 0
    try:
        sql = "SELECT COUNT(*) FROM %s where valid=1"
        self.cur.execute(sql % self.db_name)
        count = self.cur.fetchone()[0]
    except Exception:
        self.conn.rollback()
        logger.exception('get_count fail')
        traceback.print_exc()
    return count
def get_one(self) -> str:
    """Return one randomly chosen valid proxy, e.g. ``127.0.0.1:8080``.

    All rows with ``valid=1`` are read and one is picked with
    ``random.choice``.

    Returns:
        The ``IP_PORT`` string of the chosen proxy.

    Raises:
        IndexError: If there is no valid proxy to choose from. A failed
            query is logged and rolled back, then reported the same way.
    """
    # Bound before the try so a failed query surfaces as the same IndexError
    # as an empty pool, rather than an UnboundLocalError that hides the
    # logged database error.
    ports = []
    try:
        sql = "SELECT IP_PORT FROM %s where valid=1"
        self.cur.execute(sql % self.db_name)
        ports = [row[0] for row in self.cur.fetchall()]
    except Exception:
        self.conn.rollback()
        logger.exception('get_one fail')
        traceback.print_exc()
    return random.choice(ports)
def __init__(self):
    """Open the SQLite database and make sure the proxy table exists.

    Connects to ``proxy.db``, keeps a cursor on the instance, and creates the
    ``proxies`` table if it is missing. The table has three columns: ``id``,
    ``IP_PORT`` and ``valid``.

    A failure while creating the table is logged and rolled back; it is not
    raised to the caller.
    """
    self.conn = sqlite3.connect('proxy.db')
    self.cur = self.conn.cursor()
    self.db_name = 'proxies'
    create_table = "CREATE TABLE IF NOT EXISTS {} (id int PRIMARY KEY, IP_PORT varchar(20) NOT NULL, valid int(1) NOT NULL)".format(self.db_name)
    try:
        self.cur.execute(create_table)
    except Exception:
        self.conn.rollback()
        logger.exception('init fail')
        traceback.print_exc()
def fetch_66ip() -> list:
    """Scrape proxies from the 66ip export endpoint.

    The text content of the response body is split on ``'\\r\\n\\t\\t'``; the
    first chunk and the last two chunks are dropped and the remaining chunks
    are returned unchanged.

    Returns:
        A list of the extracted entries. Any exception is logged and yields
        an empty list.
    """
    url = 'http://www.66ip.cn/nmtq.php?getnum=512&isp=0&anonymoustype=0&start=&ports=&export=&ipaddress=&area=0&proxytype=2&api=66ip'
    try:
        page = get_html(url)
        body_text = page.xpath('string(//body)')
        chunks = body_text.split('\r\n\t\t')
        proxies = list(chunks[1:-2])
    except Exception:
        logger.exception('error')
        traceback.print_exc()
        proxies = []
    if not proxies:
        logger.warning(' fail to fetch 66ip')
    return proxies
def fetch_mimvp() -> list:
    """Scrape free proxies from mimvp.

    Only the IP cells are read from the page; every address is paired with
    the fixed port 80.

    Returns:
        A list of ``ip:80`` strings. Any exception is logged and yields an
        empty list.
    """
    url = 'https://proxy.mimvp.com/free.php?proxy=in_hp'
    try:
        page = get_html(url)
        hosts = page.xpath(
            '//div[@class="free-list"]/table/tbody/td[@class="tbl-proxy-ip"]/text()'
        )
        # The port is not read from the page; 80 is used for every address.
        proxies = ['{}:{}'.format(host, 80) for host in hosts]
    except Exception:
        logger.exception('error')
        traceback.print_exc()
        proxies = []
    if not proxies:
        logger.warning(' fail to fetch mimvp')
    return proxies
def fetch_xici() -> list:
    """Scrape free proxies from xicidaili.

    Requests listing pages 1 and 2. From each page, the second and third
    cells of every ``<tr class="odd">`` row are taken as IP and port and
    paired positionally.

    Returns:
        A list of ``ip:port`` strings. Any exception is logged and yields an
        empty list.
    """
    base_url = 'http://www.xicidaili.com/nn/{}'
    proxies = []
    try:
        for page in range(1, 3):
            html = get_html(base_url.format(page))
            hosts = html.xpath('//tr[@class="odd"]/td[2]/text()')
            ports = html.xpath('//tr[@class="odd"]/td[3]/text()')
            proxies.extend(
                '{}:{}'.format(host, port) for host, port in zip(hosts, ports)
            )
    except Exception:
        logger.exception('error')
        traceback.print_exc()
        proxies = []
    if not proxies:
        logger.warning(' fail to fetch xici')
    return proxies
def fetch_ip181() -> list:
    """Scrape free proxies from the ip181 front page.

    Every table cell's text is collected, only cells made purely of digits
    and dots are kept, and the survivors are read as alternating IP, port,
    IP, port, ...
    (NOTE(review): this relies on no other purely numeric cell appearing in a
    row -- confirm against the live page.)

    Returns:
        A list of ``ip:port`` strings. Any exception is logged and yields an
        empty list.
    """
    url = 'http://www.ip181.com/'
    try:
        page = get_html(url)
        cells = [td.text for td in page.xpath('//tbody/tr/td')]
        # Keep non-empty cells that contain nothing but digits and dots.
        numeric = [
            text for text in cells
            if text and text.replace('.', '').isdigit()
        ]
        # Even positions are addresses, odd positions are ports.
        proxies = [
            '{}:{}'.format(host, port)
            for host, port in zip(numeric[0::2], numeric[1::2])
        ]
    except Exception:
        logger.exception('error')
        traceback.print_exc()
        proxies = []
    if not proxies:
        logger.warning(' fail to fetch ip181')
    return proxies
def fetch_data5u() -> list:
    """Scrape free proxies from data5u.

    The text of every ``<li>`` under ``ul.l2/span`` is collected, only items
    made purely of digits and dots are kept, and the survivors are read as
    alternating IP, port, IP, port, ...
    (NOTE(review): this relies on no other purely numeric item appearing in a
    row -- confirm against the live page.)

    Returns:
        A list of ``ip:port`` strings. Any exception is logged and yields an
        empty list.
    """
    url = 'http://www.data5u.com/free/gngn/index.shtml'
    try:
        page = get_html(url)
        cells = [li.text for li in page.xpath('//ul[@class="l2"]/span/li')]
        # Keep non-empty items that contain nothing but digits and dots.
        numeric = [
            text for text in cells
            if text and text.replace('.', '').isdigit()
        ]
        # Even positions are addresses, odd positions are ports.
        proxies = [
            '{}:{}'.format(host, port)
            for host, port in zip(numeric[0::2], numeric[1::2])
        ]
    except Exception:
        logger.exception('error')
        traceback.print_exc()
        proxies = []
    if not proxies:
        logger.warning(' fail to fetch data5u')
    return proxies