Example #1
    def check(self, amount=None):
        """
        Check whether the stored proxy addresses are still valid.
        :param amount: if given, only check proxies with id <= amount
        :return: None
        """
        # TODO: switch to multi-threaded checking
        if amount:
            proxy_list = self.session.query(Proxy).filter(
                Proxy.id <= amount).all()
        else:
            proxy_list = self.session.query(Proxy).all()
        for proxy in proxy_list:
            proxy_ip = proxy.ip
            proxy_port = proxy.port
            logger.info("Testing %s:%s" % (proxy_ip, proxy_port))
            s, t = self.__check_proxy(proxy_ip, proxy_port)
            logger.debug("Time: " + str(t) + " Success: " + str(s))

            # Update the database record
            proxy_item = self.session.query(Proxy).filter(
                Proxy.id == proxy.id).first()
            proxy_item.times = t
            proxy_item.updated_time = datetime.datetime.now()
            if s:
                proxy_item.is_alive = 1

            self.session.add(proxy_item)
        self.session.commit()

    def start(self):
        if not len(self.al.spiders):
            logger.error("No spiders loaded, exiting.")
            sys.exit(1)
        else:
            message = "Loaded spiders: "
            for s in self.al.spiders:
                message += s.__class__.__name__ + ", "
            logger.info(message.strip(", "))
        # Create the spider thread pool
        if self.spider_threads:
            self.tp = ThreadPool(self.spider_threads)
        else:
            self.tp = ThreadPool()
        for sp in self.al.spiders:
            # Add each spider's run method to the thread pool
            self.tp.add_function(sp.run)
        # Start the thread pool without blocking
        self.tp.run(join=False)

        # Write out the results
        self.sd = SaveData(self.al.results,
                           self.tp,
                           use_file=self.output_file,
                           use_database=self.output_db,
                           filename=self.output_filename)
        if self.save_data_threads:
            self.write_file_tp = ThreadPool(self.save_data_threads)
        else:
            self.write_file_tp = ThreadPool()
        self.write_file_tp.add_function(self.sd.write)
        self.write_file_tp.run()
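
The ThreadPool class used above is not included in these excerpts. As a rough, hypothetical sketch of the add_function / run(join=False) interface the code relies on, here is a minimal stand-in built only on the standard threading module (names and behaviour are assumptions, not the project's actual implementation):

import threading

class MiniThreadPool(object):
    # Hypothetical stand-in for the ThreadPool used in the examples.
    def __init__(self, size=None):
        # size is accepted for interface parity; this sketch simply starts
        # one thread per registered function instead of capping workers.
        self.size = size
        self.functions = []
        self.threads = []

    def add_function(self, func):
        # Register a callable to be executed in its own worker thread.
        self.functions.append(func)

    def run(self, join=True):
        for func in self.functions:
            t = threading.Thread(target=func)
            t.start()
            self.threads.append(t)
        if join:
            for t in self.threads:
                t.join()

With join=False, run() returns as soon as the workers are started, which is why start() can pass the still-running pool to SaveData and continue.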
Example #5
    def _start(self, target):
        logger.debug("start spider " + target[0])
        deep = target[1]
        target = target[0]

        # Pick a random phantomjs process from the pool
        phantomjs_tag = random.randint(0, self.phantomjs_count - 1)

        self.driver_pool_lock[phantomjs_tag].acquire()
        retry_times = 2
        while retry_times:
            try:
                self.driver_pool[phantomjs_tag].get(target)
                break
            except Exception:
                # driver.close()
                logger.error("retry %d" % retry_times)
                retry_times -= 1
                if not retry_times:
                    logger.warning("Timed out while fetching HTML for %s" % target)
                    self.driver_pool_lock[phantomjs_tag].release()
                    return
                else:
                    continue

        # Grab the rendered page HTML
        raw_html = self.driver_pool[phantomjs_tag].execute_script(
            "return document.getElementsByTagName('html')[0].innerHTML")
        # Collect the HTTP requests made while the page loaded
        http_log = json.loads(self.driver_pool[phantomjs_tag].get_log("har")[0]
                              ["message"])["log"]["entries"]
        # Current page URL (after any redirects)
        base_url = self.driver_pool[phantomjs_tag].current_url
        # Release the driver lock
        self.driver_pool_lock[phantomjs_tag].release()

        soup = BeautifulSoup(raw_html, "html5lib")
        logger.debug("Get %s HTML done. Deep: %s" % (target, deep))

        # Process the href attributes found in the page
        for a in soup.find_all("a", href=True):
            url = a['href'].strip()
            # Skip entries that are not real URLs
            if url.startswith('javascript:') or url.startswith('#') or not url:
                continue
            elif not url.startswith('https://') and not url.startswith(
                    'http://'):
                # Convert relative paths into absolute URLs
                url = urlparse.urljoin(base_url, url)
            self.check_same_url(url, deep, self.filter_similar)

        # Process the requests captured while loading the page
        for log in http_log:
            url = log['request']['url']
            logger.info(url)
            self.check_same_url(url, deep, self.filter_similar)

        logger.debug("".join(["Raw links: ", str(self.raw_links_num)]))
        logger.debug("".join(["Filter links: ", str(self.filter_links_num)]))
Example #7
    def __write_file(self, res):
        # res[0] holds the source URL, res[1] the list of proxy records
        self.ff.write(res[0].get('url') + "\n")
        self.ff.write("ip,port,type,protocol,location,time(s)\n")
        logger.info("[*] url: " + res[0].get('url'))
        res = res[1]
        for r in res:
            line = r.get('ip', 'None') + "," + r.get('port', 'None') + "," + \
                   r.get('type', 'None') + "," + r.get('protocol', 'None') + "," + \
                   r.get('location', 'None') + "," + r.get('time', 'None')
            logger.info("[*] " + line)
            self.ff.write((line + "\n").encode("utf8"))
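
The structure of the res argument is only implied by the accesses above. A hypothetical example of a value this writer could handle (key names taken from the method, sample data invented for illustration):

res = (
    {"url": "http://example.com/free-proxy-list"},   # the page the proxies came from
    [
        {"ip": "1.2.3.4", "port": "8080", "type": "anonymous",
         "protocol": "http", "location": "unknown", "time": "0.35"},
    ],
)

Each proxy dict becomes one comma-separated row under the ip,port,type,protocol,location,time(s) header.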
Example #9
    def check_same_url(self, url, deep, filter_similar):

        # Skip URLs whose suffix marks a static resource (images, CSS, fonts)
        url_st = urlparse.urlparse(url)
        suffix = url_st.path.split(".")[-1]
        if suffix.lower() in [
                "jpg", "png", "gif", "jpeg", "bmp", "css", "ttf"
        ]:
            return

        self.raw_links_num += 1

        # First check whether the domain is inside the target scope
        if self.check_domain_limit(url):
            # Inside the target domain: normalize the URL's parameter signature.
            # If it is already in the set, a page with similar parameters was
            # crawled before, so return; otherwise continue and record it.
            formatted_url = self.format_url_param(url)
            # logger.warning(formatted_url)
            if formatted_url is not None:
                if formatted_url not in self.url_param_set:
                    self.url_param_set.add(formatted_url)
                else:
                    return

            # Normalize the URL
            r, suffix = self.format_url(url)
            if suffix:
                # Has a suffix: a normal page; apply the similarity filter if enabled
                if filter_similar and (r not in self.url_set):
                    self.filter_links_num += 1
                    self.url_set.add(r)
                    self.links.append(url)
                    logger.info(url)
                    if deep + 1 <= self.deep:
                        self.task_queue.put((url, deep + 1))
                elif not filter_similar and (url not in self.links):
                    self.filter_links_num += 1
                    self.links.append(url)
                    logger.info(url)
                    if deep + 1 <= self.deep:
                        self.task_queue.put((url, deep + 1))
            else:
                # No suffix: treat it as a directory; deduplicate exact URLs only
                if url not in self.links:
                    self.filter_links_num += 1
                    self.links.append(url)
                    logger.info(url)
                    if deep + 1 <= self.deep:
                        self.task_queue.put((url, deep + 1))
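
The static-resource filter at the top of check_same_url can be exercised on its own. A minimal standalone version, assuming the same suffix list and the Python 2 urlparse module used elsewhere in these examples:

import urlparse

STATIC_SUFFIXES = {"jpg", "png", "gif", "jpeg", "bmp", "css", "ttf"}

def is_static_resource(url):
    # Compare the last dot-separated component of the URL path against
    # the known static-resource suffixes, case-insensitively.
    path = urlparse.urlparse(url).path
    return path.split(".")[-1].lower() in STATIC_SUFFIXES

assert is_static_resource("http://example.com/logo.PNG")
assert not is_static_resource("http://example.com/index.php?id=1")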
Example #11
    def terminate(self):
        self._exit = True
        logger.info("Sent exit signal to WebClicker, waiting for it to quit.")
Example #12
    def __init__(self, target, deep=1, limit_domain=list(), thread_count=cpu_count()*2,
                 phantomjs_count=cpu_count(), filter_similar=False):

        # Set the phantomjs executable path
        SpiderBase.__init__(self)
        SpiderBase.set_phantomjs_path(self)

        # Store the crawl parameters
        self.target = target
        self.deep = deep
        if limit_domain:
            self.limit_domain = limit_domain
        else:
            self.limit_domain = ".".join(tldextract.extract(self.target))
        self.thread_count = thread_count
        self.phantomjs_count = phantomjs_count
        self.filter_similar = filter_similar

        # Sets used for deduplication
        self.url_set = set()
        self.url_param_set = set()
        # List holding the crawl results
        self.links = list()
        # Queue of targets waiting to be crawled
        self.task_queue = Queue.Queue()
        self.spider_pool = None

        # Put the initial target into the crawl queue
        self.task_queue.put((self.target, 0))

        # Statistics
        self.raw_links_num = 0
        self.filter_links_num = 0
        self.links_num = 0

        # Initialize the webdriver capabilities
        # Note: these dcap settings do not seem to take effect
        self.dcap = dict(DesiredCapabilities.PHANTOMJS)
        self.dcap["phantomjs.page.settings.resourceTimeout"] = 10
        self.dcap["phantomjs.page.settings.loadImages"] = False

        self.service_args = [
            "--webdriver-loglevel=DEBUG",
            "--webdriver-logfile=phantomjs.log",
            "--load-images=no",
            "--disk-cache=true"
        ]

        # webdriver process pool
        logger.info("Initializing web spider phantomjs process pool...")
        self.driver_pool = list()
        self.driver_pool_lock = list()
        for i in range(self.phantomjs_count):
            self.driver_pool.append(
                webdriver.PhantomJS(executable_path=self.phantomjs_path, desired_capabilities=self.dcap,
                                    service_args=self.service_args
                                    )
            )
            self.driver_pool_lock.append(
                threading.Lock()
            )
            logger.info("%.2f%% finished." % ((float(i + 1) * 100) / float(self.phantomjs_count)))
        logger.info("initial finished.")