예제 #1
0
 def start(self, html):
     page = BeautifulSoup(html, 'lxml')
     ipItem = {}
     ipTrs = page.select('#ip_list tr')
     for ipTr in ipTrs:
         ipTds = ipTr.select('td')
         if len(ipTds) < 8:
             continue
         ipItem['ip'] = ipTds[1].get_text()
         if not ipItem['ip'] or ipItem['ip'] == u'代理IP地址':
             continue
         ipItem['port'] = ipTds[2].get_text()
         ipItem['address'] = ipTds[3].get_text()
         ipItem['ip_type'] = ipTds[5].get_text()
         if ipItem['ip_type'] != u'HTTP':
             continue
         # 验证存在是否
         is_exist = self.ipDao.checkIpExist(ipItem['ip'], ipItem['port'])
         if is_exist:
             continue
         # 验证IP
         is_useful = checkProxyIp_1(ipItem['ip'], ipItem['port'],
                                    "http://wwww.baidu.com")
         if not is_useful:
             continue
         # 存到数据库
         print "IpSpider抓取有效IP:", ipItem['ip'], ipItem['port']
         self.ipDao.save(ipItem)
     pass
예제 #2
0
 def start(self, html):
     page = BeautifulSoup(html, 'lxml')
     ipItem = {}
     ipTrs = page.select('#main tbody tr')
     for ipTr in ipTrs:
         ipTds = ipTr.select('td')
         ipItem['ip'] = ipTds[0].get_text()
         ipItem['port'] = ipTds[1].get_text()
         ipItem['address'] = ipTds[2].get_text()
         ipItem['ip_type'] = "http"
         if ipItem['ip'] == "ip":
             continue
         # 验证存在是否
         is_exist = self.ipDao.checkIpExist(ipItem['ip'], ipItem['port'])
         if is_exist:
             continue
         # 验证IP
         is_useful = checkProxyIp_1(ipItem['ip'], ipItem['port'],
                                    "http://wwww.baidu.com")
         if not is_useful:
             continue
         # 存到数据库
         print "开启IpSpider_4抓取抓取有效IP:", ipItem['ip'], ipItem['port']
         self.ipDao.save(ipItem)
     pass
예제 #3
0
파일: IPDao.py 프로젝트: MaGuiSen/realstart
 def validateIp(self):
     # 从数据库中取出所有IP,正常情况下IP不多
     cursor = self.connector.cursor()
     sql_query = 'select id,ip,port from ip_address'
     cursor.execute(sql_query)
     results = cursor.fetchall()
     for id, ip, port in results:
         # 判断可用性
         is_useful = checkProxyIp_1(ip, port, 'http://wwww.baidu.com')
         if not is_useful:
             # 不可用的需要將对应ip从数据库中剔除
             sql_del = 'delete from ip_address where ip=%s and port=%s'
             cursor.execute(sql_del, (ip, port))
             self.connector.commit()
             print 'delete', ip, port
         else:
             print 'useful', ip, port
     cursor.close()