def fetch_kxdaili() -> list:
    """Scrape free proxies from kxdaili (pages 1-3).

    Returns:
        list: 'ip:port' strings; empty list on any failure.
    """
    base_url = 'http://www.kxdaili.com/dailiip/1/{}.html#ip'
    proxies = []
    try:
        for page in range(1, 4):
            html = get_html(base_url.format(page))
            cells = [td.text for td in html.xpath('//tbody/tr/td')]
            # Keep only non-empty cells that are a bare number (port)
            # or a dotted number (ip address).
            numeric = [
                c for c in cells
                if c and (c.isdigit() or ''.join(c.split('.')).isdigit())
            ]
            # Surviving cells alternate ip, port, ip, port, ...
            addresses = numeric[0::2]
            ports = numeric[1::2]
            proxies.extend(
                '{}:{}'.format(a, p) for a, p in zip(addresses, ports))
    except Exception:
        logger.exception('error')
        traceback.print_exc()
        proxies = []
    if not proxies:
        logger.warning(' fail to fetch kxdaili')
    return proxies
def fetch_kuaidaili() -> list:
    """Scrape free proxies from kuaidaili (pages 1-2).

    Sleeps 3 seconds after each page to avoid hammering the site.

    Returns:
        list: 'ip:port' strings; empty list on any failure.
    """
    base_url = 'https://www.kuaidaili.com/free/inha/{}/'
    proxies = []
    try:
        for page in range(1, 3):
            html = get_html(base_url.format(page))
            addresses = html.xpath(
                '//div[@id="list"]/table/tbody/tr/td[@data-title="IP"]/text()')
            ports = html.xpath(
                '//div[@id="list"]/table/tbody/tr/td[@data-title="PORT"]/text()'
            )
            proxies.extend(
                '{}:{}'.format(a, p) for a, p in zip(addresses, ports))
            sleep(3)
    except Exception:
        logger.exception('error')
        traceback.print_exc()
        proxies = []
    if not proxies:
        logger.warning(' fail to fetch kuaidaili')
    return proxies
def fetch_3366ip() -> list:
    """Scrape free proxies from ip3366 (pages 1-3).

    Returns:
        list: 'ip:port' strings; empty list on any failure.
    """
    base_url = 'http://www.ip3366.net/free/?stype=1&page={}'
    try:
        proxies = []
        for page in range(1, 4):
            url = base_url.format(page)
            html = get_html(url)
            tmp = [i.text for i in html.xpath('//tbody/tr/td')]
            # Guard against empty <td> cells: .text is None there, and
            # None.isdigit() would raise AttributeError, aborting the
            # whole scrape (the broad except below would then discard
            # every page already collected). Matches the filter used by
            # fetch_kxdaili / fetch_ip181 / fetch_data5u.
            res = [
                i for i in tmp
                if i and (i.isdigit() or ''.join(i.split('.')).isdigit())
            ]
            # Filtered cells alternate ip, port, ip, port, ...
            ip = [i for l, i in enumerate(res) if l % 2 == 0]
            port = [i for l, i in enumerate(res) if l % 2 == 1]
            for l, r in zip(ip, port):
                proxies.append('{}:{}'.format(l, r))
    except Exception:
        logger.exception('error')
        traceback.print_exc()
        proxies = []
    if not proxies:
        logger.warning(' fail to fetch 3366ip')
    return proxies
def set_proxy(self, request):
    """Attach a working proxy to *request*, replenishing the pool as needed.

    If the request already carries a proxy, it is assumed to have failed:
    it is marked invalid in the database before a replacement is drawn.

    Args:
        request: the outgoing request; its meta['proxy'] is (re)set to
            'http://<ip:port>'.
    """
    proxy = ProxyDatabase()
    # Top up the pool before handing anything out.
    if proxy.get_valid_count() < self.proxies_min_count:
        self.fetch_proxies()
    if 'proxy' in request.meta:
        # The current proxy failed: flag it so it is not handed out again.
        invalid_proxy = request.meta['proxy'].split('//')[1]
        logger.info('{} will be set false'.format(invalid_proxy))
        proxy.set_false(invalid_proxy)
        valid_count = int(proxy.get_valid_count())
        # Re-fetch when the pool runs low, rate-limited by
        # fetch_proxy_timedelta since the last dry spell.
        if valid_count < self.proxies_min_count and datetime.now() > (
                self.last_no_proxy_time +
                timedelta(minutes=self.fetch_proxy_timedelta)):
            logger.warning(
                'proxies counts are only {}, start to fetch'.format(
                    valid_count))
            self.fetch_proxies()
        if int(proxy.get_valid_count()) == 0:
            logger.warning('proxies all invalid, fetch new.')
            self.fetch_proxies()
    # Common tail (was duplicated in both branches): hand out a proxy.
    request.meta['proxy'] = 'http://' + proxy.get_one()
    logger.info('request proxy change to {}'.format(request.meta['proxy']))
def clear(self):
    """Delete every invalid proxy (valid=0) row from the table.

    Failures are logged and swallowed; the table is left unchanged.
    """
    # Table names cannot be bound as DB-API parameters, hence the string
    # interpolation; self.db_name is internal config, not user input.
    statement = "DELETE FROM %s WHERE valid=0" % self.db_name
    try:
        self.cur.execute(statement)
        self.conn.commit()
    except Exception:
        logger.warning('clear fail')
        traceback.print_exc()
def add_items(self, item: list):
    """Insert each proxy in *item* into the table with valid=1.

    The whole batch is committed at once and rolled back if any single
    insert fails.

    Args:
        item: iterable of 'ip:port' strings.
    """
    # NOTE(review): values are spliced into the SQL via string formatting,
    # which is injection-prone if a scraped "proxy" ever contains a quote.
    # Consider the driver's parameter binding (paramstyle unknown from here).
    template = "INSERT INTO %s (IP_PORT, valid) VALUES ('%s', 1)"
    try:
        for entry in item:
            statement = template % (self.db_name, entry)
            logger.info(statement)
            self.cur.execute(statement)
        self.conn.commit()
    except Exception:
        self.conn.rollback()
        logger.warning('add_items fail')
        traceback.print_exc()
def fetch_66ip() -> list:
    """Scrape free proxies from 66ip's bulk text endpoint.

    Returns:
        list: 'ip:port' strings; empty list on any failure.
    """
    url = 'http://www.66ip.cn/nmtq.php?getnum=512&isp=0&anonymoustype=0&start=&ports=&export=&ipaddress=&area=0&proxytype=2&api=66ip'
    proxies = []
    try:
        html = get_html(url)
        # The body is plain text; entries are separated by '\r\n\t\t',
        # with header/footer junk trimmed off either end.
        entries = html.xpath('string(//body)').split('\r\n\t\t')[1:-2]
        proxies = list(entries)
    except Exception:
        logger.exception('error')
        traceback.print_exc()
        proxies = []
    if not proxies:
        logger.warning(' fail to fetch 66ip')
    return proxies
def fetch_mimvp() -> list:
    """Scrape free proxies from mimvp's in_hp listing.

    Every entry is emitted with a hard-coded port of 80.

    Returns:
        list: 'ip:port' strings; empty list on any failure.
    """
    url = 'https://proxy.mimvp.com/free.php?proxy=in_hp'
    try:
        html = get_html(url)
        # NOTE(review): this xpath has no <tr> step (tbody/td directly),
        # and the port is assumed to be 80 for every row — confirm both
        # against the live page markup.
        addresses = html.xpath(
            '//div[@class="free-list"]/table/tbody/td[@class="tbl-proxy-ip"]/text()'
        )
        proxies = ['{}:{}'.format(addr, 80) for addr in addresses]
    except Exception:
        logger.exception('error')
        traceback.print_exc()
        proxies = []
    if not proxies:
        logger.warning(' fail to fetch mimvp')
    return proxies
def fetch_xici() -> list:
    """Scrape proxies from xicidaili's high-anonymity list (pages 1-2).

    Returns:
        list: 'ip:port' strings; empty list on any failure.
    """
    base_url = 'http://www.xicidaili.com/nn/{}'
    proxies = []
    try:
        for page in range(1, 3):
            html = get_html(base_url.format(page))
            # NOTE(review): only rows with class "odd" are matched; if the
            # site alternates row classes this skips half the entries —
            # confirm against the live markup.
            addresses = html.xpath('//tr[@class="odd"]/td[2]/text()')
            ports = html.xpath('//tr[@class="odd"]/td[3]/text()')
            proxies.extend(
                '{}:{}'.format(a, p) for a, p in zip(addresses, ports))
    except Exception:
        logger.exception('error')
        traceback.print_exc()
        proxies = []
    if not proxies:
        logger.warning(' fail to fetch xici')
    return proxies
def fetch_ip181() -> list:
    """Scrape free proxies from ip181's front page.

    Returns:
        list: 'ip:port' strings; empty list on any failure.
    """
    url = 'http://www.ip181.com/'
    proxies = []
    try:
        html = get_html(url)
        cells = [td.text for td in html.xpath('//tbody/tr/td')]
        # Keep only non-empty cells that are a bare number (port)
        # or a dotted number (ip address).
        numeric = [
            c for c in cells
            if c and (c.isdigit() or ''.join(c.split('.')).isdigit())
        ]
        # Surviving cells alternate ip, port, ip, port, ...
        addresses = numeric[0::2]
        ports = numeric[1::2]
        proxies = [
            '{}:{}'.format(a, p) for a, p in zip(addresses, ports)
        ]
    except Exception:
        logger.exception('error')
        traceback.print_exc()
        proxies = []
    if not proxies:
        logger.warning(' fail to fetch ip181')
    return proxies
def fetch_data5u() -> list:
    """Scrape free proxies from data5u's gngn listing.

    Returns:
        list: 'ip:port' strings; empty list on any failure.
    """
    url = 'http://www.data5u.com/free/gngn/index.shtml'
    proxies = []
    try:
        html = get_html(url)
        cells = [li.text for li in html.xpath('//ul[@class="l2"]/span/li')]
        # Keep only non-empty cells that are a bare number (port)
        # or a dotted number (ip address).
        numeric = [
            c for c in cells
            if c and (c.isdigit() or ''.join(c.split('.')).isdigit())
        ]
        # Surviving cells alternate ip, port, ip, port, ...
        addresses = numeric[0::2]
        ports = numeric[1::2]
        proxies = [
            '{}:{}'.format(a, p) for a, p in zip(addresses, ports)
        ]
    except Exception:
        logger.exception('error')
        traceback.print_exc()
        proxies = []
    if not proxies:
        logger.warning(' fail to fetch data5u')
    return proxies