def check(self, amount=None):
    """
    Check every selected proxy and store the outcome in the database.

    For each ``Proxy`` row the private ``__check_proxy`` helper is called
    with the row's ip and port. Its two return values are stored as
    follows: the second one is written to ``times`` and the first one
    (treated as a success flag) decides ``is_alive``. ``updated_time`` is
    set to the current time and the row is committed immediately.

    :param amount: if truthy, only rows with ``id <= amount`` are checked;
        otherwise all rows are checked.
    :return: None
    """
    # TODO: run the checks in multiple threads
    query = self.session.query(Proxy)
    if amount:
        query = query.filter(Proxy.id <= amount)
    proxy_list = query.all()

    for proxy in proxy_list:
        proxy_ip = proxy.ip
        proxy_port = proxy.port
        logger.info("Testing %s:%s" % (proxy_ip, proxy_port))
        s, t = self.__check_proxy(proxy_ip, proxy_port)
        logger.debug("Time: " + str(t) + " Success: " + str(s))

        # Update the database. The row is fetched again by id; it may have
        # disappeared since the list was loaded, so guard against None.
        proxy_item = self.session.query(Proxy).filter(
            Proxy.id == proxy.id).first()
        if proxy_item is None:
            logger.debug("Proxy %s:%s no longer exists, skipped."
                         % (proxy_ip, proxy_port))
            continue

        proxy_item.times = t
        proxy_item.updated_time = datetime.datetime.now()
        # Always write the flag: a proxy that used to work and now fails
        # must not stay marked as alive.
        proxy_item.is_alive = 1 if s else 0
        self.session.add(proxy_item)
        self.session.commit()
def _run(self, join=True):
    """
    Start every thread in the thread list, throttled by the thread limit.

    Before each thread is started this method waits until
    ``__get_current_alive_thread_count()`` is below ``__thread_count``,
    prunes finished threads from the working list, and then appends and
    starts the new thread. Once everything has been started it polls
    ``is_all_thread_dead()`` every 0.5 s and sets ``self.finished = True``
    when that returns true.

    :param join: when true, ``join()`` is called on every thread still in
        the working list before the final polling loop.
    :return: None
    """
    for t in self.__thread_list:
        # Wait for a free slot.
        while self.__get_current_alive_thread_count() >= self.__thread_count:
            time.sleep(0.5)

        # A slot is free: drop stopped threads from the working list.
        # Iterate over a snapshot, because removing from a list while
        # iterating it makes the loop skip the following element.
        for tt in list(self.__working_thread_list):
            if not tt.is_alive():
                logger.debug("[*] " + tt.getName() +
                             " deleted from working list.")
                self.__working_thread_list.remove(tt)

        # Register the task in the working list, then start it.
        self.__working_thread_list.append(t)
        logger.debug("[*] " + t.getName() + " start.")
        t.start()

    if join:
        for tt in self.__working_thread_list:
            tt.join()

    while not self.is_all_thread_dead():
        time.sleep(0.5)
    self.finished = True
def run(self):
    """
    Load ``self.url`` with PhantomJS, parse the proxy table and queue it.

    Every ``<tr>`` inside the first ``<tbody>`` becomes a dict with the
    keys ``ip``, ``port``, ``type``, ``location`` and ``time``. The value
    put on ``self.result_queue`` is a two-element list: a dict holding
    ``url``, ``type`` and ``tag`` from this object, followed by the list
    of row dicts.

    :return: None
    """
    url = self.url
    logger.debug(url)
    driver = webdriver.PhantomJS(executable_path=self.phantomjs_path)
    try:
        driver.get(url)
        raw_html = driver.execute_script(
            "return document.getElementsByTagName('html')[0].innerHTML")
    finally:
        # Always shut the PhantomJS process down; otherwise one process
        # is leaked per call.
        driver.quit()

    soup = BeautifulSoup(raw_html, "html5lib")
    table = soup.find("tbody")
    if table is None:
        # No table on the page: report an empty row list instead of
        # failing on None.
        logger.debug("No tbody found in " + url)
        rows = []
    else:
        rows = table.find_all("tr")

    t_result = []
    for tr in rows:
        td = tr.find_all("td")
        th = tr.find_all("th")
        # The first cell's text is split on ";" and the second piece is
        # taken as "ip:port".
        address = td[0].get_text().split(";")[1].split(":")
        each_item = dict()
        each_item['ip'] = address[0]
        each_item['port'] = address[1]
        each_item['type'] = td[1].get_text()
        each_item['location'] = td[2].get_text().strip()
        each_item['time'] = th[0].get_text()
        t_result.append(each_item)

    info = dict()
    info['url'] = self.url
    info['type'] = self.type
    info['tag'] = self.tag
    self.result_queue.put([info, t_result])
def run(self):
    """
    Load ``self.url`` with PhantomJS, parse the proxy table and queue it.

    Every ``<tr>`` inside the first ``<tbody>`` becomes a dict with the
    keys ``ip``, ``port``, ``type``, ``location`` and ``time``. The value
    put on ``self.result_queue`` is a two-element list: a dict holding
    ``url``, ``type`` and ``tag`` from this object, followed by the list
    of row dicts.

    :return: None
    """
    url = self.url
    logger.debug(url)
    driver = webdriver.PhantomJS(executable_path=self.phantomjs_path)
    try:
        driver.get(url)
        raw_html = driver.execute_script(
            "return document.getElementsByTagName('html')[0].innerHTML")
    finally:
        # Always shut the PhantomJS process down; otherwise one process
        # is leaked per call.
        driver.quit()

    soup = BeautifulSoup(raw_html, "html5lib")
    table = soup.find("tbody")
    if table is None:
        # No table on the page: report an empty row list instead of
        # failing on None.
        logger.debug("No tbody found in " + url)
        rows = []
    else:
        rows = table.find_all("tr")

    t_result = []
    for tr in rows:
        td = tr.find_all("td")
        th = tr.find_all("th")
        # The first cell's text is split on ";" and the second piece is
        # taken as "ip:port".
        address = td[0].get_text().split(";")[1].split(":")
        each_item = dict()
        each_item['ip'] = address[0]
        each_item['port'] = address[1]
        each_item['type'] = td[1].get_text()
        each_item['location'] = td[2].get_text().strip()
        each_item['time'] = th[0].get_text()
        t_result.append(each_item)

    info = dict()
    info['url'] = self.url
    info['type'] = self.type
    info['tag'] = self.tag
    self.result_queue.put([info, t_result])
def my_run(self, page):
    """
    Scrape one page of the kuaidaili.com proxy list.

    The page is rendered with PhantomJS and every ``<tr>`` except the
    first one is turned into a dict with the keys ``ip``, ``port``,
    ``type``, ``protocol``, ``location`` and ``time``.

    :param page: page number substituted into the list URL.
    :return: list of dicts, one per table row.
    """
    raw_url = "http://www.kuaidaili.com/proxylist/{page}/"
    url = raw_url.replace("{page}", str(page))
    logger.debug(url)

    driver = webdriver.PhantomJS(executable_path=self.phantomjs_path)
    try:
        driver.get(url)
        raw_html = driver.execute_script(
            "return document.getElementsByTagName('html')[0].innerHTML")
    finally:
        # Always shut the PhantomJS process down; otherwise one process
        # is leaked per scraped page.
        driver.quit()

    soup = BeautifulSoup(raw_html, "html5lib")
    t_result = list()
    # The first <tr> is skipped (presumably the header row).
    for tr in soup.find_all("tr")[1:]:
        td = tr.find_all("td")
        # Fill in the data.
        each_item = {}
        each_item['ip'] = td[0].get_text()
        each_item['port'] = td[1].get_text()
        each_item['type'] = td[2].get_text()
        each_item['protocol'] = td[3].get_text().replace(", ", "-")
        each_item['location'] = td[5].get_text()
        # Keep only digits and dots from the seventh cell.
        # NOTE(review): relies on Python 2 filter() returning a str.
        each_item['time'] = filter(lambda ch: ch in '0123456789.',
                                   td[6].get_text().encode("utf8"))
        t_result.append(each_item)
    return t_result
def __loop(self):
    """
    Dispatcher loop: pull queued tasks and run each one in a new thread.

    Runs until ``self.exit`` is truthy. A task is read as
    ``task[0]`` (callable), ``task[1]`` (positional args) and ``task[2]``
    (keyword args). Each started thread is appended to
    ``self.working_list``.

    :return: None
    """
    while not self.exit:
        # Either every slot is busy or there is nothing queued:
        # wait a second and look again.
        if (self.current_working_num() >= self.thread_count
                or self.func_list.empty()):
            time.sleep(1)
            continue

        # Fetch the next task and run it.
        task = self.func_list.get_nowait()
        func = task[0]
        try:
            # Name the thread after the class that owns the method ...
            owner = str(func.im_class)
            thread_name = owner.split(".")[-1].split("'")[0]
        except AttributeError:
            # ... or after the callable itself when it has no im_class.
            thread_name = func.__name__

        worker = threading.Thread(
            target=func, args=task[1], kwargs=task[2], name=thread_name)
        worker.start()
        self.working_list.append(worker)

    logger.debug("ThreadPool loop end.")
def __write_database(self, res):
    """
    Store scraped proxies, refreshing rows that already exist.

    ``res[1]`` is iterated as a sequence of dicts. A row with the same
    ip and port counts as a duplicate: only its ``updated_time`` is
    refreshed. Otherwise a new ``Proxy`` row is inserted with
    ``is_alive=0``. Each row is committed on its own; database errors are
    logged and the loop moves on to the next row.

    :param res: sequence whose second element is the list of proxy dicts.
    :return: None
    """
    for r in res[1]:
        # First check whether this address is already in the database.
        # If both ip and port match, treat it as duplicate data and do
        # not insert it again.
        proxy = self.session.query(Proxy).filter_by(
            ip=r.get("ip"), port=r.get("port")).first()
        now = datetime.datetime.now()

        if proxy:
            proxy.updated_time = now
            row, error_prefix = proxy, "Update database error. "
        else:
            row = Proxy(
                ip=r.get("ip", "None"),
                port=r.get("port", "None"),
                proxy_type=r.get("type", "None"),
                location=r.get("location", "None"),
                protocol=r.get("protocol", "None"),
                times=r.get("time", "None"),
                is_alive=0,
                created_time=now,
                updated_time=now,
            )
            error_prefix = "Save database error. "

        try:
            self.session.add(row)
            self.session.commit()
        except Exception as e:
            # A failed flush/commit leaves the session unusable until it
            # is rolled back; without this every later row fails too.
            self.session.rollback()
            logger.debug(error_prefix + str(e))
def _run(self, join=True):
    """
    Start every thread in the thread list, throttled by the thread limit.

    Before each thread is started this method waits until
    ``__get_current_alive_thread_count()`` is below ``__thread_count``,
    prunes finished threads from the working list, and then appends and
    starts the new thread. Once everything has been started it polls
    ``is_all_thread_dead()`` every 0.5 s and sets ``self.finished = True``
    when that returns true.

    :param join: when true, ``join()`` is called on every thread still in
        the working list before the final polling loop.
    :return: None
    """
    for t in self.__thread_list:
        # Wait for a free slot.
        while self.__get_current_alive_thread_count() >= self.__thread_count:
            time.sleep(0.5)

        # A slot is free: drop stopped threads from the working list.
        # Iterate over a snapshot, because removing from a list while
        # iterating it makes the loop skip the following element.
        for tt in list(self.__working_thread_list):
            if not tt.is_alive():
                logger.debug("[*] " + tt.getName() +
                             " deleted from working list.")
                self.__working_thread_list.remove(tt)

        # Register the task in the working list, then start it.
        self.__working_thread_list.append(t)
        logger.debug("[*] " + t.getName() + " start.")
        t.start()

    if join:
        for tt in self.__working_thread_list:
            tt.join()

    while not self.is_all_thread_dead():
        time.sleep(0.5)
    self.finished = True
def __loop(self):
    """
    Dispatcher loop: pull queued tasks and run each one in a new thread.

    Runs until ``self.exit`` is truthy. A task is read as
    ``task[0]`` (callable), ``task[1]`` (positional args) and ``task[2]``
    (keyword args). Each started thread is appended to
    ``self.working_list``.

    :return: None
    """
    while not self.exit:
        # Either every slot is busy or there is nothing queued:
        # wait a second and look again.
        if (self.current_working_num() >= self.thread_count
                or self.func_list.empty()):
            time.sleep(1)
            continue

        # Fetch the next task and run it.
        task = self.func_list.get_nowait()
        func = task[0]
        try:
            # Name the thread after the class that owns the method ...
            owner = str(func.im_class)
            thread_name = owner.split(".")[-1].split("'")[0]
        except AttributeError:
            # ... or after the callable itself when it has no im_class.
            thread_name = func.__name__

        worker = threading.Thread(
            target=func, args=task[1], kwargs=task[2], name=thread_name)
        worker.start()
        self.working_list.append(worker)

    logger.debug("ThreadPool loop end.")
def check(self, amount=None):
    """
    Check every selected proxy and store the outcome in the database.

    For each ``Proxy`` row the private ``__check_proxy`` helper is called
    with the row's ip and port. Its two return values are stored as
    follows: the second one is written to ``times`` and the first one
    (treated as a success flag) decides ``is_alive``. ``updated_time`` is
    set to the current time and the row is committed immediately.

    :param amount: if truthy, only rows with ``id <= amount`` are checked;
        otherwise all rows are checked.
    :return: None
    """
    # TODO: run the checks in multiple threads
    query = self.session.query(Proxy)
    if amount:
        query = query.filter(Proxy.id <= amount)
    proxy_list = query.all()

    for proxy in proxy_list:
        proxy_ip = proxy.ip
        proxy_port = proxy.port
        logger.info("Testing %s:%s" % (proxy_ip, proxy_port))
        s, t = self.__check_proxy(proxy_ip, proxy_port)
        logger.debug("Time: " + str(t) + " Success: " + str(s))

        # Update the database. The row is fetched again by id; it may have
        # disappeared since the list was loaded, so guard against None.
        proxy_item = self.session.query(Proxy).filter(
            Proxy.id == proxy.id).first()
        if proxy_item is None:
            logger.debug("Proxy %s:%s no longer exists, skipped."
                         % (proxy_ip, proxy_port))
            continue

        proxy_item.times = t
        proxy_item.updated_time = datetime.datetime.now()
        # Always write the flag: a proxy that used to work and now fails
        # must not stay marked as alive.
        proxy_item.is_alive = 1 if s else 0
        self.session.add(proxy_item)
        self.session.commit()
def is_all_thread_dead(self):
    """
    Report whether every thread in the thread list has stopped.

    As a side effect, each stopped thread that is not yet in the
    dead-thread list is logged once and appended to that list.

    :return: True if no thread in the thread list is alive, else False.
    """
    all_dead = True
    for worker in self.__thread_list:
        if worker.is_alive():
            all_dead = False
            continue
        if worker in self.__dead_threads:
            # Already recorded as finished on an earlier call.
            continue
        logger.debug("[*] " + worker.getName() + " finished.")
        self.__dead_threads.append(worker)
    return all_dead
def is_all_thread_dead(self):
    """
    Report whether every thread in the thread list has stopped.

    As a side effect, each stopped thread that is not yet in the
    dead-thread list is logged once and appended to that list.

    :return: True if no thread in the thread list is alive, else False.
    """
    all_dead = True
    for worker in self.__thread_list:
        if worker.is_alive():
            all_dead = False
            continue
        if worker in self.__dead_threads:
            # Already recorded as finished on an earlier call.
            continue
        logger.debug("[*] " + worker.getName() + " finished.")
        self.__dead_threads.append(worker)
    return all_dead
def current_working_num(self):
    """
    Count the alive threads in ``working_list`` and prune finished ones.

    Finished threads are logged and removed from ``self.working_list``.
    The count is also stored in ``self.working_thread_number``.

    :return: number of threads in the working list that are still alive.
    """
    working = 0
    # Iterate over a snapshot: removing from the list being iterated
    # skips the element after each removed one, which under-counted the
    # alive threads.
    for thread in list(self.working_list):
        if thread.is_alive():
            # The thread is still running.
            working += 1
        else:
            # The thread has finished.
            logger.debug("Thread %s end." % thread.name)
            self.working_list.remove(thread)
    self.working_thread_number = working
    return working
def __write_database(self, res):
    """
    Insert one ``Proxy`` row for every scraped proxy dict.

    ``res[1]`` is iterated as a sequence of dicts; missing keys are stored
    as the string ``"None"``. Each row is committed on its own; database
    errors are logged and the loop moves on to the next row.

    :param res: sequence whose second element is the list of proxy dicts.
    :return: None
    """
    for r in res[1]:
        now = datetime.datetime.now()
        new_proxy = Proxy(
            ip=r.get("ip", "None"),
            port=r.get("port", "None"),
            proxy_type=r.get("type", "None"),
            location=r.get("location", "None"),
            protocol=r.get("protocol", "None"),
            times=r.get("time", "None"),
            created_time=now,
            updated_time=now,
        )
        try:
            self.session.add(new_proxy)
            self.session.commit()
        except Exception as e:
            # A failed flush/commit leaves the session unusable until it
            # is rolled back; without this every later row fails too.
            self.session.rollback()
            logger.debug("save database error. " + str(e))
def current_working_num(self):
    """
    Count the alive threads in ``working_list`` and prune finished ones.

    Finished threads are logged and removed from ``self.working_list``.
    The count is also stored in ``self.working_thread_number``.

    :return: number of threads in the working list that are still alive.
    """
    working = 0
    # Iterate over a snapshot: removing from the list being iterated
    # skips the element after each removed one, which under-counted the
    # alive threads.
    for thread in list(self.working_list):
        if thread.is_alive():
            # The thread is still running.
            working += 1
        else:
            # The thread has finished.
            logger.debug("Thread %s end." % thread.name)
            self.working_list.remove(thread)
    self.working_thread_number = working
    return working
def _start(self, target):
    """
    Crawl one page and hand every discovered URL to ``check_same_url``.

    A PhantomJS driver is picked at random from ``self.driver_pool`` and
    used while holding the matching entry of ``self.driver_pool_lock``.
    URLs come from two places: the ``href`` of every ``<a>`` tag in the
    rendered HTML, and the request URLs in the driver's "har" log.

    :param target: indexable pair; ``target[0]`` is the URL to load and
        ``target[1]`` is the depth value passed on to ``check_same_url``.
    :return: None. Returns early if the page could not be loaded after
        two attempts.
    """
    logger.debug("start spider " + target[0])
    deep = target[1]
    target = target[0]

    # Pick one of the PhantomJS processes at random.
    phantomjs_tag = random.randint(0, self.phantomjs_count - 1)
    lock = self.driver_pool_lock[phantomjs_tag]

    lock.acquire()
    try:
        driver = self.driver_pool[phantomjs_tag]

        # Two attempts; the logged number counts down like the original.
        for retry_times in (2, 1):
            try:
                driver.get(target)
                break
            except Exception:
                logger.error("retry %d" % retry_times)
        else:
            logger.warn("Time out when get %s HTML" % target)
            return

        # Rendered HTML of the page.
        raw_html = driver.execute_script(
            "return document.getElementsByTagName('html')[0].innerHTML")
        # HTTP requests issued while the page was loading.
        http_log = json.loads(
            driver.get_log("har")[0]["message"])["log"]["entries"]
        # URL the driver is currently on.
        base_url = driver.current_url
    finally:
        # Release the lock on every path, including exceptions raised
        # while reading from the driver; otherwise this driver slot would
        # stay locked forever.
        lock.release()

    soup = BeautifulSoup(raw_html, "html5lib")
    logger.debug("Get %s HTML done. Deep: %s" % (target, deep))

    # Handle the href attributes found in the document.
    for a in soup.find_all("a", href=True):
        url = a['href'].strip()
        # Skip values that are not real URLs.
        if not url or url.startswith(('javascript:', '#')):
            continue
        # Convert relative paths to absolute ones. The original test
        # ("not https or not http") was always true.
        if not url.startswith(('http://', 'https://')):
            url = urlparse.urljoin(base_url, url)
        self.check_same_url(url, deep, self.filter_similar)

    # Handle the requests made while the page was opening.
    for log in http_log:
        url = log['request']['url']
        logger.info(url)
        self.check_same_url(url, deep, self.filter_similar)

    logger.debug("".join(["Raw links: ", str(self.raw_links_num)]))
    logger.debug("".join(["Filter links: ", str(self.filter_links_num)]))
def _start(self, target):
    """
    Crawl one page and hand every discovered URL to ``check_same_url``.

    A PhantomJS driver is picked at random from ``self.driver_pool`` and
    used while holding the matching entry of ``self.driver_pool_lock``.
    URLs come from two places: the ``href`` of every ``<a>`` tag in the
    rendered HTML, and the request URLs in the driver's "har" log.

    :param target: indexable pair; ``target[0]`` is the URL to load and
        ``target[1]`` is the depth value passed on to ``check_same_url``.
    :return: None. Returns early if the page could not be loaded after
        two attempts.
    """
    logger.debug("start spider " + target[0])
    deep = target[1]
    target = target[0]

    # Pick one of the PhantomJS processes at random.
    phantomjs_tag = random.randint(0, self.phantomjs_count - 1)
    lock = self.driver_pool_lock[phantomjs_tag]

    lock.acquire()
    try:
        driver = self.driver_pool[phantomjs_tag]

        # Two attempts; the logged number counts down like the original.
        for retry_times in (2, 1):
            try:
                driver.get(target)
                break
            except Exception:
                logger.error("retry %d" % retry_times)
        else:
            logger.warn("Time out when get %s HTML" % target)
            return

        # Rendered HTML of the page.
        raw_html = driver.execute_script(
            "return document.getElementsByTagName('html')[0].innerHTML")
        # HTTP requests issued while the page was loading.
        http_log = json.loads(
            driver.get_log("har")[0]["message"])["log"]["entries"]
        # URL the driver is currently on.
        base_url = driver.current_url
    finally:
        # Release the lock on every path, including exceptions raised
        # while reading from the driver; otherwise this driver slot would
        # stay locked forever.
        lock.release()

    soup = BeautifulSoup(raw_html, "html5lib")
    logger.debug("Get %s HTML done. Deep: %s" % (target, deep))

    # Handle the href attributes found in the document.
    for a in soup.find_all("a", href=True):
        url = a['href'].strip()
        # Skip values that are not real URLs.
        if not url or url.startswith(('javascript:', '#')):
            continue
        # Convert relative paths to absolute ones. The original test
        # ("not https or not http") was always true.
        if not url.startswith(('http://', 'https://')):
            url = urlparse.urljoin(base_url, url)
        self.check_same_url(url, deep, self.filter_similar)

    # Handle the requests made while the page was opening.
    for log in http_log:
        url = log['request']['url']
        logger.info(url)
        self.check_same_url(url, deep, self.filter_similar)

    logger.debug("".join(["Raw links: ", str(self.raw_links_num)]))
    logger.debug("".join(["Filter links: ", str(self.filter_links_num)]))
def start(self):
    """
    Main crawl loop: feed queued targets into the spider thread pool.

    Creates ``self.spider_pool``, submits the first queued target, and
    then keeps moving items from ``self.task_queue`` into the pool. The
    loop ends, and the pool is terminated, once the pool reports no
    working threads and the queue is empty on two checks taken two
    seconds apart.

    :return: None
    """
    logger.debug("start of web spider.")

    # Create the thread pool (this also starts its dispatcher).
    self.spider_pool = ThreadPool(self.thread_count)

    # Crawl the first page.
    first_target = self.task_queue.get_nowait()
    self.spider_pool.add_func(self._start, target=first_target)

    def nothing_left():
        # True when no worker is busy and no task is waiting.
        return ((not self.spider_pool.working_thread_number)
                and self.task_queue.empty())

    while True:
        if nothing_left():
            # Look again after a pause before deciding the crawl is over.
            time.sleep(2)
            if nothing_left():
                self.spider_pool.terminated()
                logger.debug("WebSpider loop end.")
                break

        if self.task_queue.empty():
            time.sleep(1)
            continue

        item = self.task_queue.get_nowait()
        self.spider_pool.add_func(self._start, target=(item[0], item[1]))
        time.sleep(0.1)

    logger.debug("end of web spider")
def start(self):
    """
    Main crawl loop: feed queued targets into the spider thread pool.

    Creates ``self.spider_pool``, submits the first queued target, and
    then keeps moving items from ``self.task_queue`` into the pool. The
    loop ends, and the pool is terminated, once the pool reports no
    working threads and the queue is empty on two checks taken two
    seconds apart.

    :return: None
    """
    logger.debug("start of web spider.")

    # Create the thread pool (this also starts its dispatcher).
    self.spider_pool = ThreadPool(self.thread_count)

    # Crawl the first page.
    first_target = self.task_queue.get_nowait()
    self.spider_pool.add_func(self._start, target=first_target)

    def nothing_left():
        # True when no worker is busy and no task is waiting.
        return ((not self.spider_pool.working_thread_number)
                and self.task_queue.empty())

    while True:
        if nothing_left():
            # Look again after a pause before deciding the crawl is over.
            time.sleep(2)
            if nothing_left():
                self.spider_pool.terminated()
                logger.debug("WebSpider loop end.")
                break

        if self.task_queue.empty():
            time.sleep(1)
            continue

        item = self.task_queue.get_nowait()
        self.spider_pool.add_func(self._start, target=(item[0], item[1]))
        time.sleep(0.1)

    logger.debug("end of web spider")
def my_run(self, page):
    """
    Scrape one page of the kuaidaili.com proxy list.

    The page is rendered with PhantomJS and every ``<tr>`` except the
    first one is turned into a dict with the keys ``ip``, ``port``,
    ``type``, ``protocol``, ``location`` and ``time``.

    :param page: page number substituted into the list URL.
    :return: list of dicts, one per table row.
    """
    raw_url = "http://www.kuaidaili.com/proxylist/{page}/"
    url = raw_url.replace("{page}", str(page))
    logger.debug(url)

    driver = webdriver.PhantomJS(executable_path=self.phantomjs_path)
    try:
        driver.get(url)
        raw_html = driver.execute_script(
            "return document.getElementsByTagName('html')[0].innerHTML")
    finally:
        # Always shut the PhantomJS process down; otherwise one process
        # is leaked per scraped page.
        driver.quit()

    soup = BeautifulSoup(raw_html, "html5lib")
    t_result = list()
    # The first <tr> is skipped (presumably the header row).
    for tr in soup.find_all("tr")[1:]:
        td = tr.find_all("td")
        # Fill in the data.
        each_item = {}
        each_item['ip'] = td[0].get_text()
        each_item['port'] = td[1].get_text()
        each_item['type'] = td[2].get_text()
        each_item['protocol'] = td[3].get_text().replace(", ", "-")
        each_item['location'] = td[5].get_text()
        # Keep only digits and dots from the seventh cell.
        # NOTE(review): relies on Python 2 filter() returning a str.
        each_item['time'] = filter(lambda ch: ch in '0123456789.',
                                   td[6].get_text().encode("utf8"))
        t_result.append(each_item)
    return t_result
# Crawl www.yundaex.com, log progress every five seconds until the pool
# reports no working threads, then dump every collected link to urls.txt.
import io

web_spider = WebSpider(
    target="http://www.yundaex.com/",
    limit_domain=['*.yundaex.com'],
    deep=5,
    thread_count=50
)
web_spider.do_spider()

# web_spider.start()
# while True:
#     time.sleep(1)
#     print web_spider.links
#     time.sleep(1)

while True:
    time.sleep(5)
    logger.debug("Alive thread: %d" % web_spider.spider_pool.working_thread_number)
    logger.debug("Left tasks number: %d" % web_spider.task_queue.qsize())
    logger.debug("links num before filter: %d" % web_spider.raw_links_num)
    logger.debug("links num after filter: %d" % web_spider.filter_links_num)
    if web_spider.spider_pool.working_thread_number == 0:
        break

# # print web_spider.links

# Write the links as UTF-8 text. The original decoded each URL and wrote it
# to a plain text-mode file, which on Python 2 re-encodes with ASCII and
# fails on the first non-ASCII URL.
with io.open("urls.txt", "w", encoding="utf-8") as ff:
    for url in web_spider.links:
        if isinstance(url, bytes):
            url = url.decode('utf8')
        ff.write(url + u"\n")

# urls = ['www.lightless.me', 'www.baidu.com']
# jobs = [gevent.spawn(socket.gethostbyname, url) for url in urls]
#
# gevent.joinall(jobs)
import ConfigParser
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from utils.data.LoggerHelp import logger
from utils.data.Tables import Proxy

__author__ = "lightless"
__email__ = "*****@*****.**"

if __name__ == "__main__":
    # Read the database settings from config.ini. The "database" option of
    # the [ProxySpider] section names the section that holds the
    # connection details.
    cf = ConfigParser.ConfigParser()
    cf.read("config.ini")
    db_name = cf.get("ProxySpider", "database")

    username = cf.get(db_name, "username")
    password = cf.get(db_name, "password")
    host = cf.get(db_name, "host")
    database = cf.get(db_name, "database")

    # Assemble mysql://<username>:<password>@<host>/<database>
    connection_url = "".join(
        ["mysql://", username, ":", password, "@", host, "/", database])
    engine = create_engine(connection_url)
    db_session = sessionmaker(bind=engine)

    # Create the tables registered on Proxy's metadata; failures are only
    # logged.
    try:
        Proxy.metadata.create_all(engine)
        logger.debug("Tables create success.")
    except Exception as e:
        logger.error(e.message)