def start(self, html): page = BeautifulSoup(html, 'lxml') ipItem = {} ipTrs = page.select('#ip_list tr') for ipTr in ipTrs: ipTds = ipTr.select('td') if len(ipTds) < 8: continue ipItem['ip'] = ipTds[1].get_text() if not ipItem['ip'] or ipItem['ip'] == u'代理IP地址': continue ipItem['port'] = ipTds[2].get_text() ipItem['address'] = ipTds[3].get_text() ipItem['ip_type'] = ipTds[5].get_text() if ipItem['ip_type'] != u'HTTP': continue # 验证存在是否 is_exist = self.ipDao.checkIpExist(ipItem['ip'], ipItem['port']) if is_exist: continue # 验证IP is_useful = checkProxyIp_1(ipItem['ip'], ipItem['port'], "http://wwww.baidu.com") if not is_useful: continue # 存到数据库 print "IpSpider抓取有效IP:", ipItem['ip'], ipItem['port'] self.ipDao.save(ipItem) pass
def start(self, html): page = BeautifulSoup(html, 'lxml') ipItem = {} ipTrs = page.select('#main tbody tr') for ipTr in ipTrs: ipTds = ipTr.select('td') ipItem['ip'] = ipTds[0].get_text() ipItem['port'] = ipTds[1].get_text() ipItem['address'] = ipTds[2].get_text() ipItem['ip_type'] = "http" if ipItem['ip'] == "ip": continue # 验证存在是否 is_exist = self.ipDao.checkIpExist(ipItem['ip'], ipItem['port']) if is_exist: continue # 验证IP is_useful = checkProxyIp_1(ipItem['ip'], ipItem['port'], "http://wwww.baidu.com") if not is_useful: continue # 存到数据库 print "开启IpSpider_4抓取抓取有效IP:", ipItem['ip'], ipItem['port'] self.ipDao.save(ipItem) pass
def validateIp(self): # 从数据库中取出所有IP,正常情况下IP不多 cursor = self.connector.cursor() sql_query = 'select id,ip,port from ip_address' cursor.execute(sql_query) results = cursor.fetchall() for id, ip, port in results: # 判断可用性 is_useful = checkProxyIp_1(ip, port, 'http://wwww.baidu.com') if not is_useful: # 不可用的需要將对应ip从数据库中剔除 sql_del = 'delete from ip_address where ip=%s and port=%s' cursor.execute(sql_del, (ip, port)) self.connector.commit() print 'delete', ip, port else: print 'useful', ip, port cursor.close()