Example 1
    def start(self):
        if not len(self.al.spiders):
            logger.error("No Spiders loaded. exit.")
            sys.exit(1)
        else:
            message = "Loaded spiders: "
            for s in self.al.spiders:
                message += str(s.__class__).split(".")[-1].split("'")[0] + ", "
            logger.info(message.strip(", "))
        # Create the thread pool
        if self.spider_threads:
            self.tp = ThreadPool(self.spider_threads)
        else:
            self.tp = ThreadPool()
        for sp in self.al.spiders:
            # Add each spider's run method to the thread pool
            self.tp.add_function(sp.run)
        # Start the thread pool
        self.tp.run(join=False)

        # Write out the results
        self.sd = SaveData(self.al.results, self.tp, use_file=self.output_file, use_database=self.output_db,
                           filename=self.output_filename)
        if self.save_data_threads:
            self.write_file_tp = ThreadPool(self.save_data_threads)
        else:
            self.write_file_tp = ThreadPool()
        self.write_file_tp.add_function(self.sd.write)
        self.write_file_tp.run()
Example 2
    def start(self):
        if not len(self.al.spiders):
            logger.error("No Spiders loaded. exit.")
            sys.exit(1)
        else:
            message = "Loaded spiders: "
            for s in self.al.spiders:
                message += str(s.__class__).split(".")[-1].split("'")[0] + ", "
            logger.info(message.strip(", "))
        # Create the thread pool
        if self.spider_threads:
            self.tp = ThreadPool(self.spider_threads)
        else:
            self.tp = ThreadPool()
        for sp in self.al.spiders:
            # Add each spider's run method to the thread pool
            self.tp.add_function(sp.run)
        # Start the thread pool
        self.tp.run(join=False)

        # Write out the results
        self.sd = SaveData(self.al.results,
                           self.tp,
                           use_file=self.output_file,
                           use_database=self.output_db,
                           filename=self.output_filename)
        if self.save_data_threads:
            self.write_file_tp = ThreadPool(self.save_data_threads)
        else:
            self.write_file_tp = ThreadPool()
        self.write_file_tp.add_function(self.sd.write)
        self.write_file_tp.run()
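
A note on the two snippets above: they assume a small ThreadPool helper that exposes add_function() and run(join=...). Its implementation is not part of these examples; the following is only a minimal sketch written against the calls used above, so the usage has something concrete to map onto. The one-thread-per-callable behaviour and the daemon flag are assumptions, not the project's actual pool.

    import threading


    class ThreadPool(object):
        """Minimal sketch of the pool interface used above (an assumption,
        not the project's real implementation)."""

        def __init__(self, thread_count=4):
            # Kept for signature parity; this sketch simply starts one
            # thread per registered callable.
            self.thread_count = thread_count
            self._functions = []
            self._threads = []

        def add_function(self, func, *args, **kwargs):
            # Remember the callable; it is started when run() is called.
            self._functions.append((func, args, kwargs))

        def run(self, join=True):
            # Start one thread per registered callable and optionally wait
            # for all of them to finish.
            for func, args, kwargs in self._functions:
                t = threading.Thread(target=func, args=args, kwargs=kwargs)
                t.daemon = True
                t.start()
                self._threads.append(t)
            if join:
                for t in self._threads:
                    t.join()
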
Example 3
 def _check_ip_all(self):
     rows = self.session.query(Proxy).all()
     self.thread_pool = ThreadPool(
         thread_count=10 if not len(rows) / 20 else len(rows) / 20)
     for row in rows:
         self.thread_pool.add_func(self._check,
                                   ip=row.ip,
                                   port=row.port,
                                   save_to_queue=True)
     self.thread_pool.close()
     self.thread_pool.join()
     while True:
         if self.thread_pool.exit is True and self.result_queue.empty():
             break
         else:
             try:
                 res = self.result_queue.get_nowait()
                 ip = res[0]
                 port = res[1]
                 delay = res[3]
                 alive = res[2]
                 logger.info("IP {0} Connect {1}, time: {2:.2f}s".format(ip, "success", delay)) if alive \
                     else logger.error("IP {0} Connect failed.".format(ip))
                 self._update_db(ip, port, delay, alive)
             except Queue.Empty:
                 time.sleep(2)
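
The loop above unpacks each queue item positionally: res[0] is the ip, res[1] the port, res[2] the alive flag and res[3] the measured delay. The _check worker that produces those tuples is not shown in these examples; the sketch below is a hypothetical probe that returns and enqueues results in that same order. The socket-based connect and the 5-second timeout are assumptions.

    import socket
    import time


    def _check(self, ip, port, save_to_queue=False):
        """Hypothetical connectivity probe matching the (ip, port, alive, delay)
        tuple order consumed by _check_ip_all()."""
        start = time.time()
        try:
            sock = socket.create_connection((ip, int(port)), timeout=5)
            sock.close()
            alive, delay = True, time.time() - start
        except (socket.timeout, socket.error):
            alive, delay = False, 0.0
        if save_to_queue:
            # res[2] is the alive flag and res[3] the delay, as read by the caller.
            self.result_queue.put((ip, port, alive, delay))
        return alive, delay
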
Example 4
    def _start(self, target):
        logger.debug("start spider " + target[0])
        deep = target[1]
        target = target[0]

        # Pick a random phantomjs process
        phantomjs_tag = random.randint(0, self.phantomjs_count - 1)

        self.driver_pool_lock[phantomjs_tag].acquire()
        retry_times = 2
        while retry_times:
            try:
                self.driver_pool[phantomjs_tag].get(target)
                break
            except:
                # driver.close()
                logger.error("retry %d" % retry_times)
                retry_times -= 1
                if not retry_times:
                    logger.warn("Time out when get %s HTML" % target)
                    self.driver_pool_lock[phantomjs_tag].release()
                    return
                else:
                    continue

        # Fetch the page HTML
        raw_html = self.driver_pool[phantomjs_tag].execute_script(
            "return document.getElementsByTagName('html')[0].innerHTML")
        # Collect the HTTP requests made while the page loaded
        http_log = json.loads(self.driver_pool[phantomjs_tag].get_log("har")[0]
                              ["message"])["log"]["entries"]
        # Get the current page URL
        base_url = self.driver_pool[phantomjs_tag].current_url
        # Release the lock
        self.driver_pool_lock[phantomjs_tag].release()

        soup = BeautifulSoup(raw_html, "html5lib")
        logger.debug("Get %s HTML done. Deep: %s" % (target, deep))

        # Process the href tags found in the page
        for a in soup.find_all("a", href=True):
            url = a['href'].strip()
            # Skip parts that are not real URLs
            if url.startswith('javascript:') or url.startswith('#') or not url:
                continue
            elif not url.startswith('https://') and not url.startswith(
                    'http://'):
                # Convert relative paths to absolute URLs
                url = urlparse.urljoin(base_url, url)
            self.check_same_url(url, deep, self.filter_similar)

        # Process the requests triggered while the page was loading
        for log in http_log:
            url = log['request']['url']
            logger.info(url)
            self.check_same_url(url, deep, self.filter_similar)

        logger.debug("".join(["Raw links: ", str(self.raw_links_num)]))
        logger.debug("".join(["Filter links: ", str(self.filter_links_num)]))
Example 5
    def _start(self, target):
        logger.debug("start spider " + target[0])
        deep = target[1]
        target = target[0]

        # Pick a random phantomjs process
        phantomjs_tag = random.randint(0, self.phantomjs_count-1)

        self.driver_pool_lock[phantomjs_tag].acquire()
        retry_times = 2
        while retry_times:
            try:
                self.driver_pool[phantomjs_tag].get(target)
                break
            except:
                # driver.close()
                logger.error("retry %d" % retry_times)
                retry_times -= 1
                if not retry_times:
                    logger.warn("Time out when get %s HTML" % target)
                    self.driver_pool_lock[phantomjs_tag].release()
                    return
                else:
                    continue

        # Fetch the page HTML
        raw_html = self.driver_pool[phantomjs_tag].execute_script(
            "return document.getElementsByTagName('html')[0].innerHTML"
        )
        # Collect the HTTP requests made while the page loaded
        http_log = json.loads(self.driver_pool[phantomjs_tag].get_log("har")[0]["message"])["log"]["entries"]
        # Get the current page URL
        base_url = self.driver_pool[phantomjs_tag].current_url
        # Release the lock
        self.driver_pool_lock[phantomjs_tag].release()

        soup = BeautifulSoup(raw_html, "html5lib")
        logger.debug("Get %s HTML done. Deep: %s" % (target, deep))

        # Process the href tags found in the page
        for a in soup.find_all("a", href=True):
            url = a['href'].strip()
            # Skip parts that are not real URLs
            if url.startswith('javascript:') or url.startswith('#') or not url:
                continue
            elif not url.startswith('https://') and not url.startswith('http://'):
                # Convert relative paths to absolute URLs
                url = urlparse.urljoin(base_url, url)
            self.check_same_url(url, deep, self.filter_similar)

        # Process the requests triggered while the page was loading
        for log in http_log:
            url = log['request']['url']
            logger.info(url)
            self.check_same_url(url, deep, self.filter_similar)

        logger.debug("".join(["Raw links: ", str(self.raw_links_num)]))
        logger.debug("".join(["Filter links: ", str(self.filter_links_num)]))
Example 6
 def __write_file(self, res):
     self.ff.write(res[0].get('url') + "\n")
     self.ff.write("ip,port,type,protocol,location,time(s)\n")
     logger.info("[*] url: " + res[0].get('url'))
     res = res[1]
     for r in res:
         line = r.get('ip', 'None') + "," + r.get('port', 'None') + "," + \
                r.get('type', 'None') + "," + r.get('protocol', 'None') + "," + \
                r.get('location', 'None') + "," + r.get('time', 'None')
         logger.info("[*] " + line)
         self.ff.write((line + "\n").encode("utf8"))
Example 7
 def __write_file(self, res):
     self.ff.write(res[0].get('url') + "\n")
     self.ff.write("ip,port,type,protocol,location,time(s)\n")
     logger.info("[*] url: " + res[0].get('url'))
     res = res[1]
     for r in res:
         line = r.get('ip', 'None') + "," + r.get('port', 'None') + "," + \
                r.get('type', 'None') + "," + r.get('protocol', 'None') + "," + \
                r.get('location', 'None') + "," + r.get('time', 'None')
         logger.info("[*] " + line)
         self.ff.write((line + "\n").encode("utf8"))
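
__write_file() above expects res to be a two-element structure: res[0] is a dict carrying the source url, and res[1] a list of proxy dicts keyed by ip, port, type, protocol, location and time, with every value already a string (they are concatenated with +). A hypothetical input illustrating that assumed shape (all values are placeholders):

    res = (
        {"url": "http://www.example.com/proxy-list/1"},   # res[0]: source page metadata
        [                                                  # res[1]: one dict per proxy row
            {"ip": "10.0.0.1", "port": "8080", "type": "high-anonymous",
             "protocol": "http", "location": "somewhere", "time": "0.52"},
            {"ip": "10.0.0.2", "port": "3128", "type": "transparent",
             "protocol": "https", "location": "elsewhere", "time": "1.07"},
        ],
    )
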
Example 8
 def _check_ip_list(self, raw_ips):
     try:
         if raw_ips is not None and len(raw_ips):
             ips = raw_ips.split(",")
             for ip in ips:
                 ip_stu = ip.strip().split(":")
                 s, t = self._check(ip_stu[0], ip_stu[1])
                 logger.info("IP {0} Connect {1}, time: {2:.2f}s".format(ip, "success", t)) if s \
                     else logger.error("IP {0} Connect failed.".format(ip))
                 self._update_db(ip_stu[0], ip_stu[1], t, s)
         else:
             logger.fatal("No IP provide.")
             sys.exit(1)
     except KeyError:
         logger.fatal("No IP provide.")
         sys.exit(1)
Example 9
 def _check_ip_list(self, raw_ips):
     try:
         if raw_ips is not None and len(raw_ips):
             ips = raw_ips.split(",")
             for ip in ips:
                 ip_stu = ip.strip().split(":")
                 s, t = self._check(ip_stu[0], ip_stu[1])
                 logger.info("IP {0} Connect {1}, time: {2:.2f}s".format(ip, "success", t)) if s \
                     else logger.error("IP {0} Connect failed.".format(ip))
                 self._update_db(ip_stu[0], ip_stu[1], t, s)
         else:
             logger.fatal("No IP provide.")
             sys.exit(1)
     except KeyError:
         logger.fatal("No IP provide.")
         sys.exit(1)
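
The parsing above implies that raw_ips is a comma-separated list of host:port pairs. A quick sketch of the assumed input format (the addresses are placeholders):

    # Comma-separated "ip:port" entries, split apart the same way as in
    # _check_ip_list().
    raw_ips = "10.0.0.1:8080, 10.0.0.2:3128"

    for entry in raw_ips.split(","):
        host, port = entry.strip().split(":")
        print(host, port)
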
Example 10
 def _check_ip_all(self):
     rows = self.session.query(Proxy).all()
     self.thread_pool = ThreadPool(thread_count=10 if not len(rows)/20 else len(rows)/20)
     for row in rows:
         self.thread_pool.add_func(self._check, ip=row.ip, port=row.port, save_to_queue=True)
     self.thread_pool.close()
     self.thread_pool.join()
     while True:
         if self.thread_pool.exit is True and self.result_queue.empty():
             break
         else:
             try:
                 res = self.result_queue.get_nowait()
                 ip = res[0]
                 port = res[1]
                 delay = res[3]
                 alive = res[2]
                 logger.info("IP {0} Connect {1}, time: {2:.2f}s".format(ip, "success", delay)) if alive \
                     else logger.error("IP {0} Connect failed.".format(ip))
                 self._update_db(ip, port, delay, alive)
             except Queue.Empty:
                 time.sleep(2)
Example 11
 def clean_dead_proxy(self):
     try:
         logger.info("Start clean dead proxy in db.")
         dead_proxy = self.session.query(Proxy).filter(Proxy.is_alive == "0").all()
         logger.info("Found {} dead proxy in db.".format(len(dead_proxy)))
         for dp in dead_proxy:
             self.session.delete(dp)
         self.session.commit()
         logger.info("Clean done. {} dead proxies cleaned.".format(len(dead_proxy)))
     except SQLAlchemyError:
         logger.fatal("Error occurred when clean dead proxy from db.")
         sys.exit(1)
Example 12
 def clean_dead_proxy(self):
     try:
         logger.info("Start clean dead proxy in db.")
         dead_proxy = self.session.query(Proxy).filter(
             Proxy.is_alive == "0").all()
         logger.info("Found {} dead proxy in db.".format(len(dead_proxy)))
         for dp in dead_proxy:
             self.session.delete(dp)
         self.session.commit()
         logger.info("Clean done. {} dead proxies cleaned.".format(
             len(dead_proxy)))
     except SQLAlchemyError:
         logger.fatal("Error occurred when clean dead proxy from db.")
         sys.exit(1)
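
clean_dead_proxy() filters on Proxy.is_alive == "0", which implies the ORM model stores the alive flag (and, judging from the other snippets, ip and port as well) as strings. The actual model is not part of these examples; a hypothetical SQLAlchemy declarative mapping consistent with that usage might look like the following. The table name and the delay column are assumptions.

    from sqlalchemy import Column, Integer, String
    from sqlalchemy.ext.declarative import declarative_base

    Base = declarative_base()


    class Proxy(Base):
        """Hypothetical mapping consistent with the queries in these examples."""
        __tablename__ = "proxy"

        id = Column(Integer, primary_key=True)
        ip = Column(String(64))
        port = Column(String(8))
        is_alive = Column(String(2))   # "1" alive, "0" dead, as filtered above
        delay = Column(String(16))     # assumed column for the measured delay
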
Example 13
    def check_same_url(self, url, deep, filter_similar):

        # Check whether the URL suffix is an image or other static resource
        url_st = urlparse.urlparse(url)
        suffix = url_st.path.split(".")[-1]
        if suffix.lower() in [
                "jpg", "png", "gif", "jpeg", "bmp", "css", "ttf"
        ]:
            return

        self.raw_links_num += 1

        # First check whether the domain is within the target scope
        if self.check_domain_limit(url):
            # Within the target domain, check the parameter pattern
            # If it is already in the set, a page with similar parameters was crawled before, so return
            # If it is not, it has not been seen before; continue processing and add it to the set
            formatted_url = self.format_url_param(url)
            # logger.warning(formatted_url)
            if formatted_url is not None:
                if formatted_url not in self.url_param_set:
                    self.url_param_set.add(formatted_url)
                else:
                    return

            # Normalize the URL
            r, suffix = self.format_url(url)
            if suffix:
                # Has a suffix: a normal page; continue according to the similarity-filter setting
                if filter_similar and (r not in self.url_set):
                    self.filter_links_num += 1
                    self.url_set.add(r)
                    self.links.append(url)
                    logger.info(url)
                    if deep + 1 <= self.deep:
                        self.task_queue.put((url, deep + 1))
                elif not filter_similar and (url not in self.links):
                    self.filter_links_num += 1
                    self.links.append(url)
                    logger.info(url)
                    if deep + 1 <= self.deep:
                        self.task_queue.put((url, deep + 1))
            else:
                # No suffix: a directory; deduplicate exact matches only, no similarity filtering
                if url not in self.links:
                    self.filter_links_num += 1
                    self.links.append(url)
                    logger.info(url)
                    if deep + 1 <= self.deep:
                        self.task_queue.put((url, deep + 1))
Example 14
    def check_same_url(self, url, deep, filter_similar):

        # Check whether the URL suffix is an image or other static resource
        url_st = urlparse.urlparse(url)
        suffix = url_st.path.split(".")[-1]
        if suffix.lower() in ["jpg", "png", "gif", "jpeg", "bmp", "css", "ttf"]:
            return

        self.raw_links_num += 1

        # First check whether the domain is within the target scope
        if self.check_domain_limit(url):
            # Within the target domain, check the parameter pattern
            # If it is already in the set, a page with similar parameters was crawled before, so return
            # If it is not, it has not been seen before; continue processing and add it to the set
            formatted_url = self.format_url_param(url)
            # logger.warning(formatted_url)
            if formatted_url is not None:
                if formatted_url not in self.url_param_set:
                    self.url_param_set.add(formatted_url)
                else:
                    return

            # Normalize the URL
            r, suffix = self.format_url(url)
            if suffix:
                # Has a suffix: a normal page; continue according to the similarity-filter setting
                if filter_similar and (r not in self.url_set):
                    self.filter_links_num += 1
                    self.url_set.add(r)
                    self.links.append(url)
                    logger.info(url)
                    if deep + 1 <= self.deep:
                        self.task_queue.put((url, deep + 1))
                elif not filter_similar and (url not in self.links):
                    self.filter_links_num += 1
                    self.links.append(url)
                    logger.info(url)
                    if deep + 1 <= self.deep:
                        self.task_queue.put((url, deep + 1))
            else:
                # No suffix: a directory; deduplicate exact matches only, no similarity filtering
                if url not in self.links:
                    self.filter_links_num += 1
                    self.links.append(url)
                    logger.info(url)
                    if deep + 1 <= self.deep:
                        self.task_queue.put((url, deep + 1))
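
check_same_url() relies on two helpers that are not shown in these examples: format_url_param(), which collapses URLs differing only in query-string values so "similar" pages are crawled once, and format_url(), which splits a normalized form from its suffix. The sketch below is a hypothetical format_url_param() capturing only the behaviour the calling code depends on (it returns None when there is no query string, and the same key for URLs that differ only in parameter values); it is not the project's implementation.

    import urlparse


    def format_url_param(url):
        """Hypothetical sketch: keep scheme, host, path and the parameter NAMES
        while dropping the VALUES, so URLs that differ only in values collapse
        to the same key."""
        parts = urlparse.urlparse(url)
        if not parts.query:
            # No parameters: the caller then skips the parameter-based check.
            return None
        param_names = sorted(p.split("=")[0] for p in parts.query.split("&"))
        return "{0}://{1}{2}?{3}".format(parts.scheme, parts.netloc, parts.path,
                                         "&".join(param_names))
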
Example 15
 def terminate(self):
     self._exit = True
     logger.info("Send exit to WebClicker..Wait for quit.")
Example 16
    def __init__(self, target, deep=1, limit_domain=list(), thread_count=cpu_count()*2,
                 phantomjs_count=cpu_count(), filter_similar=False):

        # Set the phantomjs path
        SpiderBase.__init__(self)
        SpiderBase.set_phantomjs_path(self)

        # Set parameters
        self.target = target
        self.deep = deep
        if limit_domain:
            self.limit_domain = limit_domain
        else:
            self.limit_domain = ".".join(tldextract.extract(self.target))
        self.thread_count = thread_count
        self.phantomjs_count = phantomjs_count
        self.filter_similar = filter_similar

        # Sets used for deduplication
        self.url_set = set()
        self.url_param_set = set()
        # List storing the crawl results
        self.links = list()
        # Queue of targets waiting to be crawled
        self.task_queue = Queue.Queue()
        self.spider_pool = None

        # Put the initial target into the crawl queue
        self.task_queue.put((self.target, 0))

        # Statistics
        self.raw_links_num = 0
        self.filter_links_num = 0
        self.links_num = 0

        # Initialize the webdriver
        # dcap does not seem to take effect
        self.dcap = dict(DesiredCapabilities.PHANTOMJS)
        self.dcap["phantomjs.page.settings.resourceTimeout"] = 10
        self.dcap["phantomjs.page.settings.loadImages"] = False

        self.service_args = [
            "--webdriver-loglevel=DEBUG",
            "--webdriver-logfile=phantomjs.log"
            "--load-images=no",
            "--disk-cache=true"
        ]

        # webdriver process pool
        logger.info("initial web spider phantomjs process pool...")
        self.driver_pool = list()
        self.driver_pool_lock = list()
        for i in range(self.phantomjs_count):
            self.driver_pool.append(
                webdriver.PhantomJS(executable_path=self.phantomjs_path, desired_capabilities=self.dcap,
                                    service_args=self.service_args
                                    )
            )
            self.driver_pool_lock.append(
                threading.Lock()
            )
            logger.info("%.2f%% finished." % ((float(i + 1) * 100) / float(self.phantomjs_count)))
        logger.info("initial finished.")
Example 17
    def __init__(self,
                 target,
                 deep=1,
                 limit_domain=list(),
                 thread_count=cpu_count() * 2,
                 phantomjs_count=cpu_count(),
                 filter_similar=False):

        # Set the phantomjs path
        SpiderBase.__init__(self)
        SpiderBase.set_phantomjs_path(self)

        # Set parameters
        self.target = target
        self.deep = deep
        if limit_domain:
            self.limit_domain = limit_domain
        else:
            self.limit_domain = ".".join(tldextract.extract(self.target))
        self.thread_count = thread_count
        self.phantomjs_count = phantomjs_count
        self.filter_similar = filter_similar

        # Sets used for deduplication
        self.url_set = set()
        self.url_param_set = set()
        # List storing the crawl results
        self.links = list()
        # Queue of targets waiting to be crawled
        self.task_queue = Queue.Queue()
        self.spider_pool = None

        # Put the initial target into the crawl queue
        self.task_queue.put((self.target, 0))

        # Statistics
        self.raw_links_num = 0
        self.filter_links_num = 0
        self.links_num = 0

        # Initialize the webdriver
        # dcap does not seem to take effect
        self.dcap = dict(DesiredCapabilities.PHANTOMJS)
        self.dcap["phantomjs.page.settings.resourceTimeout"] = 10
        self.dcap["phantomjs.page.settings.loadImages"] = False

        self.service_args = [
            "--webdriver-loglevel=DEBUG", "--webdriver-logfile=phantomjs.log"
            "--load-images=no", "--disk-cache=true"
        ]

        # webdriver process pool
        logger.info("initial web spider phantomjs process pool...")
        self.driver_pool = list()
        self.driver_pool_lock = list()
        for i in range(self.phantomjs_count):
            self.driver_pool.append(
                webdriver.PhantomJS(executable_path=self.phantomjs_path,
                                    desired_capabilities=self.dcap,
                                    service_args=self.service_args))
            self.driver_pool_lock.append(threading.Lock())
            logger.info("%.2f%% finished." %
                        ((float(i + 1) * 100) / float(self.phantomjs_count)))
        logger.info("initial finished.")