def run(self):
    url = self.url
    logger.debug(url)
    driver = webdriver.PhantomJS(executable_path=self.phantomjs_path)
    driver.get(url)
    raw_html = driver.execute_script(
        "return document.getElementsByTagName('html')[0].innerHTML")
    # Release the PhantomJS process once the rendered HTML has been captured.
    driver.quit()
    soup = BeautifulSoup(raw_html, "html5lib")
    table = soup.find("tbody")
    t_result = []
    for tr in table.find_all("tr"):
        each_item = dict()
        td = tr.find_all("td")
        each_item['ip'] = td[0].get_text().split(";")[1].split(":")[0]
        each_item['port'] = td[0].get_text().split(";")[1].split(":")[1]
        each_item['type'] = td[1].get_text()
        each_item['location'] = td[2].get_text().strip()
        th = tr.find_all("th")
        each_item['time'] = th[0].get_text()
        t_result.append(each_item)
    result = []
    info = dict()
    info['url'] = self.url
    info['type'] = self.type
    info['tag'] = self.tag
    result.append(info)
    result.append(t_result)
    self.result_queue.put(result)
def _run(self, join=True):
    for t in self.__thread_list:
        # Wait for a free slot in the worker pool.
        while True:
            if self.__get_current_alive_thread_count() < self.__thread_count:
                break
            else:
                time.sleep(0.5)
        # A slot is free: remove threads that have already stopped from the
        # working list (iterate over a copy so removal is safe).
        for tt in self.__working_thread_list[:]:
            if not tt.is_alive():
                logger.debug("[*] " + tt.getName() + " deleted from working list.")
                self.__working_thread_list.remove(tt)
        # Add this task to the working list and start the thread.
        self.__working_thread_list.append(t)
        logger.debug("[*] " + t.getName() + " start.")
        t.start()
    if join:
        for tt in self.__working_thread_list:
            tt.join()
        while True:
            if self.is_all_thread_dead():
                self.finished = True
                break
            else:
                time.sleep(0.5)
def __write_database(self, res):
    res = res[1]
    for r in res:
        # Check whether this IP is already in the database first.
        # If both the IP and the port match an existing row, treat it as a
        # duplicate: only refresh its updated_time instead of inserting it.
        proxy = self.session.query(Proxy).filter_by(
            ip=r.get("ip"), port=r.get("port")).first()
        if proxy:
            proxy.updated_time = datetime.datetime.now()
            try:
                self.session.add(proxy)
                self.session.commit()
            except Exception as e:
                logger.debug("Update database error. " + str(e))
            continue
        new_proxy = Proxy(ip=r.get("ip", "None"),
                          port=r.get("port", "None"),
                          proxy_type=r.get("type", "None"),
                          location=r.get("location", "None"),
                          protocol=r.get("protocol", "None"),
                          times=r.get("time", "None"),
                          is_alive=0,
                          created_time=datetime.datetime.now(),
                          updated_time=datetime.datetime.now())
        try:
            self.session.add(new_proxy)
            self.session.commit()
        except Exception as e:
            logger.debug("Save database error. " + str(e))
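# For context: a minimal sketch of what the Proxy model imported from
# utils.Data.Tables might look like, inferred only from the column names used
# above. The real table name, column types and lengths are assumptions.
from sqlalchemy import Column, DateTime, Integer, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class Proxy(Base):
    __tablename__ = "proxy"                    # assumed table name

    id = Column(Integer, primary_key=True)     # assumed surrogate key
    ip = Column(String(64))
    port = Column(String(16))
    proxy_type = Column(String(32))
    location = Column(String(128))
    protocol = Column(String(32))
    times = Column(String(32))
    is_alive = Column(Integer, default=0)
    created_time = Column(DateTime)
    updated_time = Column(DateTime)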
def my_run(self, page):
    raw_url = "http://www.kuaidaili.com/proxylist/{page}/"
    url = raw_url.replace("{page}", str(page))
    logger.debug(url)
    driver = webdriver.PhantomJS(executable_path=self.phantomjs_path)
    driver.get(url)
    raw_html = driver.execute_script(
        "return document.getElementsByTagName('html')[0].innerHTML")
    # Release the PhantomJS process once the rendered HTML has been captured.
    driver.quit()
    soup = BeautifulSoup(raw_html, "html5lib")
    t_result = list()
    for tr in soup.find_all("tr")[1:]:
        each_item = {}
        td = tr.find_all("td")
        # Fill in the fields for this row.
        each_item['ip'] = td[0].get_text()
        each_item['port'] = td[1].get_text()
        each_item['type'] = td[2].get_text()
        each_item['protocol'] = td[3].get_text().replace(", ", "-")
        each_item['location'] = td[5].get_text()
        # Keep only digits and the decimal point from the response-time cell
        # (Python 2: filter() over a str returns a str).
        each_item['time'] = filter(lambda ch: ch in '0123456789.',
                                   td[6].get_text().encode("utf8"))
        t_result.append(each_item)
    return t_result
def is_all_thread_dead(self):
    flags = True
    for t in self.__thread_list:
        if t.is_alive():
            flags = False
        elif t not in self.__dead_threads:
            logger.debug("[*] " + t.getName() + " finished.")
            self.__dead_threads.append(t)
    return flags
def current_working_num(self):
    working = 0
    # Iterate over a copy so that finished threads can be removed safely.
    for thread in self.working_list[:]:
        if thread.isAlive():
            # The thread is still running.
            working += 1
        else:
            # The thread has finished; drop it from the working list.
            logger.debug("Thread %s end." % thread.name)
            self.working_list.remove(thread)
            self.dead_thread_number += 1
    self.working_thread_number = working
    return working
def __loop(self):
    while True:
        if self.exit:
            logger.debug("ThreadPool loop end.")
            break
        if self.joined and self.all_thread_number == self.dead_thread_number:
            self.terminated()
        if self.current_working_num() >= self.thread_count:
            # No free worker slot; wait and retry.
            time.sleep(1)
            if self.DEBUG:
                logger.debug("No more place.")
            continue
        if self.func_list.empty():
            # No pending task; wait and retry.
            time.sleep(1)
            if self.DEBUG:
                logger.debug("No more task.")
            continue
        # Fetch a task and run it in a new thread.
        task = self.func_list.get_nowait()
        try:
            thread_name = str(task[0].im_class).split(".")[-1].split("'")[0]
        except AttributeError:
            thread_name = task[0].__name__
        thread = threading.Thread(target=task[0], args=task[1],
                                  kwargs=task[2], name=thread_name)
        thread.start()
        if self.DEBUG:
            logger.debug(thread_name + " start.")
        self.working_list.append(thread)
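# A minimal, self-contained sketch of the dispatch pattern __loop implements:
# tasks are queued as (func, args, kwargs) tuples and handed to worker
# threads. All names below (fetch, tasks) are illustrative, not the project's
# actual API.
import Queue
import threading


def fetch(url):
    print "fetching", url          # placeholder task


tasks = Queue.Queue()
tasks.put((fetch, ("http://example.com",), {}))

while not tasks.empty():
    func, args, kwargs = tasks.get_nowait()
    worker = threading.Thread(target=func, args=args, kwargs=kwargs, name=func.__name__)
    worker.start()
    worker.join()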
def _start(self, target):
    logger.debug("start spider " + target[0])
    deep = target[1]
    target = target[0]

    # Pick a random PhantomJS instance from the pool and lock it.
    phantomjs_tag = random.randint(0, self.phantomjs_count - 1)
    self.driver_pool_lock[phantomjs_tag].acquire()

    retry_times = 2
    while retry_times:
        try:
            self.driver_pool[phantomjs_tag].get(target)
            break
        except Exception:
            # driver.close()
            logger.error("retry %d" % retry_times)
            retry_times -= 1
            if not retry_times:
                logger.warn("Time out when get %s HTML" % target)
                self.driver_pool_lock[phantomjs_tag].release()
                return
            else:
                continue

    # Grab the rendered HTML.
    raw_html = self.driver_pool[phantomjs_tag].execute_script(
        "return document.getElementsByTagName('html')[0].innerHTML")
    # Collect the HTTP requests made while the page was loading.
    http_log = json.loads(self.driver_pool[phantomjs_tag].get_log("har")[0]
                          ["message"])["log"]["entries"]
    # Remember the final page URL.
    base_url = self.driver_pool[phantomjs_tag].current_url
    # Release the lock on this PhantomJS instance.
    self.driver_pool_lock[phantomjs_tag].release()

    soup = BeautifulSoup(raw_html, "html5lib")
    logger.debug("Get %s HTML done. Deep: %s" % (target, deep))

    # Handle the href attributes found in the page.
    for a in soup.find_all("a", href=True):
        url = a['href'].strip()
        # Skip empty hrefs, fragments and javascript: pseudo-links.
        if url.startswith('javascript:') or url.startswith('#') or not url:
            continue
        elif not url.startswith(('http://', 'https://')):
            # Turn relative paths into absolute URLs.
            url = urlparse.urljoin(base_url, url)
        self.check_same_url(url, deep, self.filter_similar)

    # Handle the requests triggered while opening the page.
    for log in http_log:
        url = log['request']['url']
        logger.info(url)
        self.check_same_url(url, deep, self.filter_similar)

    logger.debug("".join(["Raw links: ", str(self.raw_links_num)]))
    logger.debug("".join(["Filter links: ", str(self.filter_links_num)]))
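# For reference: urlparse.urljoin, used above, resolves relative hrefs against
# the current page URL. A standalone illustration (example.com is a placeholder):
import urlparse

print urlparse.urljoin("http://example.com/news/index.html", "detail.html")
# -> http://example.com/news/detail.html
print urlparse.urljoin("http://example.com/news/index.html", "/about")
# -> http://example.com/about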
def start(self):
    logger.debug("start of web spider.")

    # Start the thread pool, which also starts its task dispatcher.
    self.spider_pool = ThreadPool(self.thread_count)
    # Crawl the first page.
    self.spider_pool.add_func(self._start, target=self.task_queue.get_nowait())

    while True:
        if (not self.spider_pool.working_thread_number) and self.task_queue.empty():
            time.sleep(2)
            if (not self.spider_pool.working_thread_number) and self.task_queue.empty():
                self.spider_pool.terminated()
                logger.debug("WebSpider loop end.")
                break
        if self.task_queue.empty():
            time.sleep(1)
            continue
        target = self.task_queue.get_nowait()
        self.spider_pool.add_func(self._start, target=(target[0], target[1]))
        time.sleep(0.1)

    logger.debug("end of web spider")
def _check(self, ip, port, save_to_queue=False):
    """
    Check whether the given proxy IP and port are alive.

    :param ip: proxy IP
    :param port: proxy port
    :param save_to_queue: if True, push the result onto the result queue;
                          otherwise do not store it. Defaults to False.
    :return: success, delay. If the proxy is alive, success is True and
             delay is the measured latency; otherwise success is False and
             delay is 0.
    """
    # Validate the arguments.
    if ip == "" or port == "":
        logger.error("Invalid ip or port found. Skipping...")
        return False, -1.0

    # Up to 3 attempts.
    retry = 3
    time_summary = 0.0
    success = False
    while retry:
        logger.debug("Times: {0}. Trying {1}:{2} connection...".format(
            3 - retry + 1, ip, port))
        proxies = {'http': ip + ":" + port}
        try:
            time_start = time.time()
            requests.get("http://ip.cn/", headers=self.headers,
                         proxies=proxies, timeout=10)
            time_summary = time.time() - time_start
            success = True
            break
        except requests.RequestException:
            logger.warning("{0}:{1} proxy time out.".format(ip, port))
            continue
        finally:
            retry -= 1

    if save_to_queue:
        self.result_queue.put((ip, port, success, time_summary))
    return success, time_summary
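# The essence of the check above as a standalone snippet: time a request to
# http://ip.cn/ routed through the proxy. The proxy address below is a
# placeholder; the real method also retries and pushes results onto a queue.
import time

import requests

proxies = {"http": "http://127.0.0.1:8080"}    # placeholder proxy
start = time.time()
try:
    requests.get("http://ip.cn/", proxies=proxies, timeout=10)
    print "alive, delay %.2fs" % (time.time() - start)
except requests.RequestException:
    print "dead or timed out"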
#!/usr/bin/env python2
# coding: utf-8

import ConfigParser

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from utils.Data.LoggerHelp import logger
from utils.Data.Tables import Proxy

__author__ = "lightless"
__email__ = "*****@*****.**"


if __name__ == "__main__":
    cf = ConfigParser.ConfigParser()
    cf.read("config.ini")
    db_name = cf.get("Pansidong", "database")
    username = cf.get(db_name, "username")
    password = cf.get(db_name, "password")
    host = cf.get(db_name, "host")
    database = cf.get(db_name, "database")
    engine = create_engine("mysql://" + username + ":" + password + "@" + host + "/" + database)
    db_session = sessionmaker(bind=engine)
    try:
        Proxy.metadata.create_all(engine)
        logger.debug("Tables create success.")
    except Exception as e:
        logger.error(str(e))
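# The script above reads a config.ini shaped roughly like the following.
# Section and key names come from the cf.get calls; the MySQL section name
# and all values are placeholders.
#
#   [Pansidong]
#   database = MySQL
#
#   [MySQL]
#   username = root
#   password = secret
#   host = 127.0.0.1
#   database = pansidong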
def start_parse(self):
    # --version
    if self.command_args.version:
        print Version
        sys.exit(0)

    # --update-proxy-db
    if self.command_args.update_proxy_db:
        logger.debug("Update Proxy DB selected.")
        ps = ProxySpider.ProxySpider()
        ps.load()
        ps.start()
        sys.exit(0)

    # --check-proxy
    if self.command_args.check_proxy:
        logger.debug("Check proxy selected.")
        ips = self.command_args.check_proxy
        logger.debug(ips)
        pm = ProxyManage.ProxyManage(ips=ips)
        pm.check()
        sys.exit(0)

    # --check-proxy-all
    if self.command_args.check_proxy_all:
        logger.debug("Check all proxy selected.")
        pm = ProxyManage.ProxyManage(all=True)
        pm.check()
        sys.exit(0)

    # --get-alive-proxy
    if self.command_args.get_alive_proxy:
        logger.debug("Get alive proxy selected.")
        logger.debug(self.command_args.get_alive_proxy)
        pm = ProxyManage.ProxyManage()
        params = self.command_args.get_alive_proxy
        if "," in params:
            amount = params.split(",")[0].strip()
            delay = params.split(",")[1].strip()
            pm.get_alive_proxy(amount, delay)
        else:
            pm.get_alive_proxy(params.strip())

    # --clean-db
    if self.command_args.clean_db:
        logger.debug("Clean db selected.")
        pm = ProxyManage.ProxyManage()
        pm.clean_dead_proxy()
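# The flag handling above implies roughly the following argparse setup. The
# real parser lives elsewhere in the project, so treat this only as a sketch:
# flag names come from the comments above, everything else (prog name,
# metavars) is assumed.
import argparse

parser = argparse.ArgumentParser(prog="pansidong")
parser.add_argument("--version", action="store_true")
parser.add_argument("--update-proxy-db", action="store_true")
parser.add_argument("--check-proxy", metavar="IP:PORT")
parser.add_argument("--check-proxy-all", action="store_true")
parser.add_argument("--get-alive-proxy", metavar="AMOUNT[,DELAY]")
parser.add_argument("--clean-db", action="store_true")
command_args = parser.parse_args()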