class IpParse_4(object): """ """ def __init__(self, connector): pass self.index = 0 self.ipDao = IPDao(connector) def start(self, html): page = BeautifulSoup(html, 'lxml') ipItem = {} ipTrs = page.select('#main tbody tr') for ipTr in ipTrs: ipTds = ipTr.select('td') ipItem['ip'] = ipTds[0].get_text() ipItem['port'] = ipTds[1].get_text() ipItem['address'] = ipTds[2].get_text() ipItem['ip_type'] = "http" if ipItem['ip'] == "ip": continue # 验证存在是否 is_exist = self.ipDao.checkIpExist(ipItem['ip'], ipItem['port']) if is_exist: continue # 验证IP is_useful = checkProxyIp_1(ipItem['ip'], ipItem['port'], "http://wwww.baidu.com") if not is_useful: continue # 存到数据库 print "开启IpSpider_4抓取抓取有效IP:", ipItem['ip'], ipItem['port'] self.ipDao.save(ipItem) pass
def __init__(self, connector):
    """Initialise the thread base class and the per-instance crawl state.

    :param connector: passed to ``IPDao`` and ``BookCatchRecordDao`` and
        also kept on ``self.connector``.
    """
    threading.Thread.__init__(self)

    # Plain state flags and counters.
    self.needNextUrl = False
    self.ipValid = None
    self.currPage = 0

    # Collaborators (constructed in the same order as before).
    self.ipDao = IPDao(connector)
    self.log = Log()

    # Whether the parameters need to be changed; on a server error they are
    # not changed and the original parameters are kept.
    self.canOverCatch = True

    self.bookCatchRecordDao = BookCatchRecordDao(connector)
    self.connector = connector
    self.isException = False
def validateIp(self): connector = DBDao().getConnector() print "开启IP验证" IPDao(connector).validateIp() connector.close() # 睡5秒 Timer(5, self.validateIp).start()
def checkIP(self): self.ipValid = IPDao(self.connector).getOneIp() if self.ipValid and len(self.ipValid) >= 2: print u">新的ip:", self.ipValid else: self.ipValid = None print u">没有新的IP"
def __init__(self):
    """Set the initial crawl state and create the helper objects."""
    # Plain state flags and counters.
    self.needNextUrl = False
    self.ipValid = None
    self.currPage = 0

    # Collaborators (constructed in the same order as before).
    self.ipDao = IPDao()
    self.log = Log()

    # Whether the parameters need to be changed; on a server error they are
    # not changed and the original parameters are kept.
    self.needChangeParams = True

    self.bookDetailDao = BookDetailParse()
    self.getIpFromXici = GetIpFromXici()
class IpParse(object): """ """ def __init__(self, connector): pass self.index = 0 self.ipDao = IPDao(connector) def start(self, html): page = BeautifulSoup(html, 'lxml') ipItem = {} ipTrs = page.select('#ip_list tr') for ipTr in ipTrs: ipTds = ipTr.select('td') if len(ipTds) < 8: continue ipItem['ip'] = ipTds[1].get_text() if not ipItem['ip'] or ipItem['ip'] == u'代理IP地址': continue ipItem['port'] = ipTds[2].get_text() ipItem['address'] = ipTds[3].get_text() ipItem['ip_type'] = ipTds[5].get_text() if ipItem['ip_type'] != u'HTTP': continue # 验证存在是否 is_exist = self.ipDao.checkIpExist(ipItem['ip'], ipItem['port']) if is_exist: continue # 验证IP is_useful = checkProxyIp_1(ipItem['ip'], ipItem['port'], "http://wwww.baidu.com") if not is_useful: continue # 存到数据库 print "IpSpider抓取有效IP:", ipItem['ip'], ipItem['port'] self.ipDao.save(ipItem) pass
def __init__(self, connector):
    """Start the index at zero and build the ``IPDao`` from ``connector``."""
    self.index = 0
    self.ipDao = IPDao(connector)
class TouTiaohaoRead(threading.Thread): def __init__(self, connector): threading.Thread.__init__(self) self.needNextUrl = False self.ipValid = None self.currPage = 0 self.ipDao = IPDao(connector) self.log = Log() self.canOverCatch = True # 是否需要更换参数,在服务器异常的时候不需要更换,保持原有参数 self.bookCatchRecordDao = BookCatchRecordDao(connector) self.connector = connector self.isException = False pass def run(self): urls = [ 'http://www.toutiao.com/i6477734526264017422/', 'http://www.toutiao.com/i6476422082686091790/', 'http://www.toutiao.com/i6476420572015231501/', 'http://www.toutiao.com/i6476294333824762382/', 'http://www.toutiao.com/i6475868700351136270/', 'http://www.toutiao.com/i6477135834528088589/' ] while not Global.consoleToStopCatch: for url in urls: self.request(url) time.sleep(int(format(random.randint(5, 10)))) def request(self, url): user_agent = random.choice(Constant.USER_AGENTS) try: self.checkIP() if self.ipValid and len(self.ipValid) >= 2: proxies = { "http": "http://%s:%s" % self.ipValid, "https": "http://%s:%s" % self.ipValid } print u">代理" + str(proxies) response = requests.get(url, proxies=proxies, headers={"User-Agent": user_agent}, timeout=10) req_code = response.status_code req_msg = response.reason print u">状态:%s,消息:%s" % (str(req_code), req_msg) print u"> %s" % (url, ) if req_code >= 400: self.exceptionOperate_1() else: print u">成功" else: self.ipValid = None raise requests.exceptions.ProxyError(u"没有IP") except Exception as e: print u">出错%s" % (str(e), ) print url self.exceptionOperate_1() def exceptionOperate_1(self): """ 处理异常情况1 :return: """ if self.ipValid: ip, port = self.ipValid self.ipDao.deleteIpUnUseful(ip, port) self.ipValid = None def checkIP(self): self.ipValid = IPDao(self.connector).getOneIp() if self.ipValid and len(self.ipValid) >= 2: print u">新的ip:", self.ipValid else: self.ipValid = None print u">没有新的IP"