def __fetch_sitedigger(self):
    """Scrape the proxy list published on site-digger.com.

    Fetches a fixed article page, pulls the <textarea> holding the
    "host:port" lines via XPath, and parses them into tuples.

    Returns:
        list of (host, port) string tuples on success (may be empty),
        or False when the page could not be fetched / had no content.
    """
    url = 'http://www.site-digger.com/html/articles/20110516/proxieslist.html'
    xpath = '//*[@id="content_detail"]/div[3]/p[3]/textarea'
    try:
        html = urllib2.urlopen(url).read()
        doc = htmlParser(html, 'utf-8')
        content = doc.xpathEval(xpath)[0].content
        if not content:
            return False
        self.logger.p_log('成功代开网页,获取内容的长度为:%d 字节' % len(content))
    except Exception:
        # Must bail out here: the original fell through and then hit a
        # NameError on the undefined `content` below.
        self.logger.p_log('打开目标网页 %s 获取内容时出错,退出中...' % (url))
        return False
    proxies = []
    # The first two and last two lines of the textarea are surrounding
    # text, not proxy records — presumably; verify against the live page.
    for ip_port in content.split('\n')[2:-2]:
        try:
            host, port = ip_port.split(':')[0], ip_port.split(':')[1]
        except Exception:
            self.logger.p_log('处理记录:%s 是发生错误' % ip_port)
            # Skip the malformed record; the original fell through and
            # re-appended the previous iteration's proxy (or crashed on
            # the first iteration with `proxy` undefined).
            continue
        proxy = (host, port)
        if proxy not in proxies:
            proxies.append(proxy)
    # Return the collected list — the original discarded it.
    return proxies
def __get_ip_from_myip_cn(self, host=None, port=None):
    """Ask www.myip.cn which IP it sees when routed through a proxy.

    Used to verify that the proxy (host, port) actually masks the
    caller's address.

    Args:
        host: proxy hostname or IP string.
        port: proxy port (string or int).

    Returns:
        The IP string reported by the page, False when arguments are
        missing or the lookup fails, or None when the page returned
        empty content (preserved from the original behavior).
    """
    if not host or not port:
        return False
    url = 'http://www.myip.cn'
    xpath = '/html/body/center/div[4]/font[1]/b'
    # %s formatting accepts an int port; the original string
    # concatenation raised an uncaught TypeError (it ran before the
    # try block) whenever port was not already a string.
    proxy = {'http': 'http://%s:%s' % (host, port)}
    try:
        html = urllib.urlopen(url, proxies=proxy).read()
        doc = htmlParser(html, 'utf-8')
        content = doc.xpathEval(xpath)[0].content
        if content:
            # Page text looks like "<label> <ip>" — the second
            # space-separated token is the address. An IndexError here
            # is swallowed below and reported as failure.
            ip_there = content.split(' ')[1]
            return ip_there
    except Exception:
        return False