Example #1
    def run(self):

        url = self.url
        logger.debug(url)
        driver = webdriver.PhantomJS(executable_path=self.phantomjs_path)
        driver.get(url)
        raw_html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
        # Release the PhantomJS process once the page source has been captured
        driver.quit()

        soup = BeautifulSoup(raw_html, "html5lib")
        table = soup.find("tbody")
        t_result = []
        for tr in table.find_all("tr"):
            each_item = dict()
            td = tr.find_all("td")
            each_item['ip'] = td[0].get_text().split(";")[1].split(":")[0]
            each_item['port'] = td[0].get_text().split(";")[1].split(":")[1]
            each_item['type'] = td[1].get_text()
            each_item['location'] = td[2].get_text().strip()
            th = tr.find_all("th")
            each_item['time'] = th[0].get_text()
            t_result.append(each_item)
        result = []
        info = dict()
        info['url'] = self.url
        info['type'] = self.type
        info['tag'] = self.tag
        result.append(info)
        result.append(t_result)
        self.result_queue.put(result)
Example #2
 def _run(self, join=True):
     for t in self.__thread_list:
         # Wait for a free worker slot
         while True:
             if self.__get_current_alive_thread_count() < self.__thread_count:
                 break
             else:
                 time.sleep(0.5)
         # A slot has opened up; remove threads that have already finished from the working list.
         # Iterate over a copy so that removing entries does not skip any.
         for tt in self.__working_thread_list[:]:
             if not tt.is_alive():
                 logger.debug("[*] " + tt.getName() + " deleted from working list.")
                 self.__working_thread_list.remove(tt)
         # A free slot was found; add this task to the working list
         self.__working_thread_list.append(t)
         # Start the thread
         logger.debug("[*] " + t.getName() + " start.")
         t.start()
         if join:
             for tt in self.__working_thread_list:
                 tt.join()
     while True:
         if self.is_all_thread_dead():
             self.finished = True
             break
         else:
             time.sleep(0.5)
Example #3
    def __write_database(self, res):
        res = res[1]
        for r in res:
            # First check whether this IP is already in the database.
            # If both the IP and the port match an existing row,
            # treat it as a duplicate and do not insert it again.

            proxy = self.session.query(Proxy).filter_by(ip=r.get("ip"), port=r.get("port")).first()
            if proxy:
                proxy.updated_time = datetime.datetime.now()
                try:
                    self.session.add(proxy)
                    self.session.commit()
                except Exception as e:
                    logger.debug("Update database error. " + str(e))
                continue

            new_proxy = Proxy(ip=r.get("ip", "None"), port=r.get("port", "None"), proxy_type=r.get("type", "None"),
                              location=r.get("location", "None"), protocol=r.get("protocol", "None"),
                              times=r.get("time", "None"), is_alive=0, created_time=datetime.datetime.now(),
                              updated_time=datetime.datetime.now())
            try:
                self.session.add(new_proxy)
                self.session.commit()
            except Exception as e:
                logger.debug("Save database error. " + str(e))
Example #4
    def run(self):

        url = self.url
        logger.debug(url)
        driver = webdriver.PhantomJS(executable_path=self.phantomjs_path)
        driver.get(url)
        raw_html = driver.execute_script(
            "return document.getElementsByTagName('html')[0].innerHTML")
        # Release the PhantomJS process once the page source has been captured
        driver.quit()

        soup = BeautifulSoup(raw_html, "html5lib")
        table = soup.find("tbody")
        t_result = []
        for tr in table.find_all("tr"):
            each_item = dict()
            td = tr.find_all("td")
            each_item['ip'] = td[0].get_text().split(";")[1].split(":")[0]
            each_item['port'] = td[0].get_text().split(";")[1].split(":")[1]
            each_item['type'] = td[1].get_text()
            each_item['location'] = td[2].get_text().strip()
            th = tr.find_all("th")
            each_item['time'] = th[0].get_text()
            t_result.append(each_item)
        result = []
        info = dict()
        info['url'] = self.url
        info['type'] = self.type
        info['tag'] = self.tag
        result.append(info)
        result.append(t_result)
        self.result_queue.put(result)
Example #5
 def _run(self, join=True):
     for t in self.__thread_list:
         # Wait for a free worker slot
         while True:
             if self.__get_current_alive_thread_count() < self.__thread_count:
                 break
             else:
                 time.sleep(0.5)
         # A slot has opened up; remove threads that have already finished from the working list.
         # Iterate over a copy so that removing entries does not skip any.
         for tt in self.__working_thread_list[:]:
             if not tt.is_alive():
                 logger.debug("[*] " + tt.getName() +
                              " deleted from working list.")
                 self.__working_thread_list.remove(tt)
         # A free slot was found; add this task to the working list
         self.__working_thread_list.append(t)
         # Start the thread
         logger.debug("[*] " + t.getName() + " start.")
         t.start()
         if join:
             for tt in self.__working_thread_list:
                 tt.join()
     while True:
         if self.is_all_thread_dead():
             self.finished = True
             break
         else:
             time.sleep(0.5)
Example #6
    def __write_database(self, res):
        res = res[1]
        for r in res:
            # First check whether this IP is already in the database.
            # If both the IP and the port match an existing row,
            # treat it as a duplicate and do not insert it again.

            proxy = self.session.query(Proxy).filter_by(
                ip=r.get("ip"), port=r.get("port")).first()
            if proxy:
                proxy.updated_time = datetime.datetime.now()
                try:
                    self.session.add(proxy)
                    self.session.commit()
                except Exception as e:
                    logger.debug("Update database error. " + str(e))
                continue

            new_proxy = Proxy(ip=r.get("ip", "None"),
                              port=r.get("port", "None"),
                              proxy_type=r.get("type", "None"),
                              location=r.get("location", "None"),
                              protocol=r.get("protocol", "None"),
                              times=r.get("time", "None"),
                              is_alive=0,
                              created_time=datetime.datetime.now(),
                              updated_time=datetime.datetime.now())
            try:
                self.session.add(new_proxy)
                self.session.commit()
            except Exception as e:
                logger.debug("Save database error. " + str(e))
Example #7
    def my_run(self, page):
        raw_url = "http://www.kuaidaili.com/proxylist/{page}/"
        url = raw_url.replace("{page}", str(page))
        logger.debug(url)
        driver = webdriver.PhantomJS(executable_path=self.phantomjs_path)
        driver.get(url)
        raw_html = driver.execute_script(
            "return document.getElementsByTagName('html')[0].innerHTML")
        # Release the PhantomJS process once the page source has been captured
        driver.quit()

        soup = BeautifulSoup(raw_html, "html5lib")
        t_result = list()
        for tr in soup.find_all("tr")[1:]:
            each_item = {}
            td = tr.find_all("td")

            # Fill in the fields for this row
            each_item['ip'] = td[0].get_text()
            each_item['port'] = td[1].get_text()
            each_item['type'] = td[2].get_text()
            each_item['protocol'] = td[3].get_text().replace(", ", "-")
            each_item['location'] = td[5].get_text()
            each_item['time'] = filter(lambda ch: ch in '0123456789.',
                                       td[6].get_text().encode("utf8"))
            t_result.append(each_item)
        return t_result
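
The time-parsing line above relies on a Python 2 detail: filter() applied to a str returns a str. Under Python 3 the same call returns an iterator, so the characters would have to be joined back together. A minimal standalone sketch of a version that behaves the same way on both interpreters:

    # Keep only digits and dots from a free-form latency string such as "3.2 seconds".
    raw = "3.2 seconds"
    digits = "".join(ch for ch in raw if ch in "0123456789.")
    print(digits)  # -> "3.2"
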
Example #8
 def is_all_thread_dead(self):
     flags = True
     for t in self.__thread_list:
         if t.is_alive():
             flags = False
         elif t not in self.__dead_threads:
                 logger.debug("[*] " + t.getName() + " finished.")
                 self.__dead_threads.append(t)
     return flags
Example #9
 def is_all_thread_dead(self):
     flags = True
     for t in self.__thread_list:
         if t.is_alive():
             flags = False
         elif t not in self.__dead_threads:
             logger.debug("[*] " + t.getName() + " finished.")
             self.__dead_threads.append(t)
     return flags
Example #10
 def current_working_num(self):
     working = 0
     # Iterate over a copy so that removing finished threads does not skip entries.
     for thread in self.working_list[:]:
         if thread.isAlive():
             # The thread is still running
             working += 1
         else:
             # The thread has finished
             logger.debug("Thread %s end." % thread.name)
             self.working_list.remove(thread)
             self.dead_thread_number += 1
     self.working_thread_number = working
     return working
Example #11
    def __loop(self):
        while True:

            if self.exit:
                logger.debug("ThreadPool loop end.")
                break

            if self.joined and self.all_thread_number == self.dead_thread_number:
                self.terminated()

            if self.current_working_num() >= self.thread_count:
                # No free worker slot available
                time.sleep(1)
                if self.DEBUG:
                    logger.debug("No more place.")
                continue

            if self.func_list.empty():
                # No task is waiting
                time.sleep(1)
                if self.DEBUG:
                    logger.debug("No more task.")
                continue

            # Fetch a task and run it
            task = self.func_list.get_nowait()
            try:
                thread_name = str(task[0].im_class).split(".")[-1].split("'")[0]
            except AttributeError:
                thread_name = task[0].__name__
            thread = threading.Thread(target=task[0], args=task[1], kwargs=task[2], name=thread_name)
            thread.start()
            if self.DEBUG:
                logger.debug(thread_name + " start.")
            self.working_list.append(thread)
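
Judging from the unpacking above, each item on func_list appears to be a (callable, args, kwargs) tuple; add_func itself is not shown in these examples, so the enqueueing sketch below is an assumption based only on how __loop consumes the queue:

    import Queue  # Python 2 standard library queue

    def fetch(url, timeout=10):
        # Placeholder worker used purely for illustration
        print("fetching %s (timeout=%s)" % (url, timeout))

    func_list = Queue.Queue()
    # Mirror the unpacking in __loop: task[0] = callable, task[1] = args, task[2] = kwargs
    func_list.put((fetch, ("http://example.com",), {"timeout": 5}))

    task = func_list.get_nowait()
    task[0](*task[1], **task[2])
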
Example #12
    def _start(self, target):
        logger.debug("start spider " + target[0])
        deep = target[1]
        target = target[0]

        # Pick a PhantomJS instance at random from the pool
        phantomjs_tag = random.randint(0, self.phantomjs_count - 1)

        self.driver_pool_lock[phantomjs_tag].acquire()
        retry_times = 2
        while retry_times:
            try:
                self.driver_pool[phantomjs_tag].get(target)
                break
            except Exception:
                # driver.close()
                logger.error("retry %d" % retry_times)
                retry_times -= 1
                if not retry_times:
                    logger.warn("Time out when get %s HTML" % target)
                    self.driver_pool_lock[phantomjs_tag].release()
                    return
                else:
                    continue

        # Grab the page HTML
        raw_html = self.driver_pool[phantomjs_tag].execute_script(
            "return document.getElementsByTagName('html')[0].innerHTML")
        # Collect the HTTP requests made while the page was loading
        http_log = json.loads(self.driver_pool[phantomjs_tag].get_log("har")[0]
                              ["message"])["log"]["entries"]
        # Record the URL of the page we actually ended up on
        base_url = self.driver_pool[phantomjs_tag].current_url
        # Release the lock on this PhantomJS instance
        self.driver_pool_lock[phantomjs_tag].release()

        soup = BeautifulSoup(raw_html, "html5lib")
        logger.debug("Get %s HTML done. Deep: %s" % (target, deep))

        # Process the href tags extracted from the page
        for a in soup.find_all("a", href=True):
            url = a['href'].strip()
            # Skip entries that are not real URLs
            if url.startswith('javascript:') or url.startswith('#') or not url:
                continue
            elif not (url.startswith('http://') or url.startswith('https://')):
                # Convert relative paths to absolute URLs
                url = urlparse.urljoin(base_url, url)
            self.check_same_url(url, deep, self.filter_similar)

        # Process the requests triggered while the page was loading
        for log in http_log:
            url = log['request']['url']
            logger.info(url)
            self.check_same_url(url, deep, self.filter_similar)

        logger.debug("".join(["Raw links: ", str(self.raw_links_num)]))
        logger.debug("".join(["Filter links: ", str(self.filter_links_num)]))
Example #13
    def _start(self, target):
        logger.debug("start spider " + target[0])
        deep = target[1]
        target = target[0]

        # Pick a PhantomJS instance at random from the pool
        phantomjs_tag = random.randint(0, self.phantomjs_count-1)

        self.driver_pool_lock[phantomjs_tag].acquire()
        retry_times = 2
        while retry_times:
            try:
                self.driver_pool[phantomjs_tag].get(target)
                break
            except Exception:
                # driver.close()
                logger.error("retry %d" % retry_times)
                retry_times -= 1
                if not retry_times:
                    logger.warn("Time out when get %s HTML" % target)
                    self.driver_pool_lock[phantomjs_tag].release()
                    return
                else:
                    continue

        # Grab the page HTML
        raw_html = self.driver_pool[phantomjs_tag].execute_script(
            "return document.getElementsByTagName('html')[0].innerHTML"
        )
        # Collect the HTTP requests made while the page was loading
        http_log = json.loads(self.driver_pool[phantomjs_tag].get_log("har")[0]["message"])["log"]["entries"]
        # Record the URL of the page we actually ended up on
        base_url = self.driver_pool[phantomjs_tag].current_url
        # Release the lock on this PhantomJS instance
        self.driver_pool_lock[phantomjs_tag].release()

        soup = BeautifulSoup(raw_html, "html5lib")
        logger.debug("Get %s HTML done. Deep: %s" % (target, deep))

        # Process the href tags extracted from the page
        for a in soup.find_all("a", href=True):
            url = a['href'].strip()
            # Skip entries that are not real URLs
            if url.startswith('javascript:') or url.startswith('#') or not url:
                continue
            elif not (url.startswith('http://') or url.startswith('https://')):
                # Convert relative paths to absolute URLs
                url = urlparse.urljoin(base_url, url)
            self.check_same_url(url, deep, self.filter_similar)

        # Process the requests triggered while the page was loading
        for log in http_log:
            url = log['request']['url']
            logger.info(url)
            self.check_same_url(url, deep, self.filter_similar)

        logger.debug("".join(["Raw links: ", str(self.raw_links_num)]))
        logger.debug("".join(["Filter links: ", str(self.filter_links_num)]))
Example #14
    def start(self):
        logger.debug("start of web spider.")

        # Start the thread pool, which also starts its dispatcher loop
        self.spider_pool = ThreadPool(self.thread_count)
        # Crawl the first page
        self.spider_pool.add_func(self._start, target=self.task_queue.get_nowait())
        while True:

            if (not self.spider_pool.working_thread_number) and self.task_queue.empty():
                time.sleep(2)
                if (not self.spider_pool.working_thread_number) and self.task_queue.empty():
                    self.spider_pool.terminated()
                    logger.debug("WebSpider loop end.")
                    break

            if self.task_queue.empty():
                time.sleep(1)
                continue

            target = self.task_queue.get_nowait()
            self.spider_pool.add_func(self._start, target=(target[0], target[1]))
            time.sleep(0.1)

        logger.debug("end of web spider")
Example #15
    def start(self):
        logger.debug("start of web spider.")

        # Start the thread pool, which also starts its dispatcher loop
        self.spider_pool = ThreadPool(self.thread_count)
        # Crawl the first page
        self.spider_pool.add_func(self._start,
                                  target=self.task_queue.get_nowait())
        while True:

            if (not self.spider_pool.working_thread_number
                ) and self.task_queue.empty():
                time.sleep(2)
                if (not self.spider_pool.working_thread_number
                    ) and self.task_queue.empty():
                    self.spider_pool.terminated()
                    logger.debug("WebSpider loop end.")
                    break

            if self.task_queue.empty():
                time.sleep(1)
                continue

            target = self.task_queue.get_nowait()
            self.spider_pool.add_func(self._start,
                                      target=(target[0], target[1]))
            time.sleep(0.1)

        logger.debug("end of web spider")
Example #16
    def _check(self, ip, port, save_to_queue=False):
        """
        检测给定的代理IP和端口是否存活
        :param ip: 代理IP
        :param port: 代理端口
        :param save_to_queue: 如果设置为True,则存储到结果队列中,否则不存储,默认为False
        :return: success, delay 如果目标代理存活,则success为True且delay为延迟,否则为False,delay为0
        """
        # Validate the arguments
        if ip == "" or port == "":
            logger.error("Invalid ip or port found. Skipping...")
            return False, -1.0

        # Allow up to 3 retries
        retry = 3
        time_summary = 0.0
        success = False
        while retry:
            logger.debug("Times: {0}. Trying {1}:{2} connection...".format(
                3 - retry + 1, ip, port))
            proxies = {'http': ip + ":" + port}

            try:
                time_start = time.time()
                requests.get("http://ip.cn/",
                             headers=self.headers,
                             proxies=proxies,
                             timeout=10)
                time_summary = time.time() - time_start
                success = True
                break
            except requests.RequestException:
                logger.warning("{0}:{1} proxy time out.".format(ip, port))
                continue
            finally:
                retry -= 1
        if save_to_queue:
            self.result_queue.put((ip, port, success, time_summary))
        return success, time_summary
Example #17
    def _check(self, ip, port, save_to_queue=False):
        """
        检测给定的代理IP和端口是否存活
        :param ip: 代理IP
        :param port: 代理端口
        :param save_to_queue: 如果设置为True,则存储到结果队列中,否则不存储,默认为False
        :return: success, delay 如果目标代理存活,则success为True且delay为延迟,否则为False,delay为0
        """
        # Validate the arguments
        if ip == "" or port == "":
            logger.error("Invalid ip or port found. Skipping...")
            return False, -1.0

        # Allow up to 3 retries
        retry = 3
        time_summary = 0.0
        success = False
        while retry:
            logger.debug("Times: {0}. Trying {1}:{2} connection...".format(3-retry+1, ip, port))
            proxies = {
                'http': ip + ":" + port
            }

            try:
                time_start = time.time()
                requests.get("http://ip.cn/", headers=self.headers, proxies=proxies, timeout=10)
                time_summary = time.time() - time_start
                success = True
                break
            except requests.RequestException:
                logger.warning("{0}:{1} proxy time out.".format(ip, port))
                continue
            finally:
                retry -= 1
        if save_to_queue:
            self.result_queue.put((ip, port, success, time_summary))
        return success, time_summary
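
One caveat about the proxies dict built above: requests normally expects proxy URLs to include a scheme (for example {'http': 'http://1.2.3.4:8080'}); the bare ip:port form works with some versions, but the explicit form is the documented one. A minimal standalone check using a hypothetical proxy address:

    import requests

    ip, port = "1.2.3.4", "8080"  # hypothetical proxy, for illustration only
    proxies = {"http": "http://%s:%s" % (ip, port)}
    try:
        resp = requests.get("http://ip.cn/", proxies=proxies, timeout=10)
        print(resp.status_code)
    except requests.RequestException:
        print("proxy appears to be dead")
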
Example #18
#!/usr/bin/env python2
# coding: utf-8
import ConfigParser

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from utils.Data.LoggerHelp import logger
from utils.Data.Tables import Proxy

__author__ = "lightless"
__email__ = "*****@*****.**"

if __name__ == "__main__":
    cf = ConfigParser.ConfigParser()
    cf.read("config.ini")
    db_name = cf.get("Pansidong", "database")
    username = cf.get(db_name, "username")
    password = cf.get(db_name, "password")
    host = cf.get(db_name, "host")
    database = cf.get(db_name, "database")

    engine = create_engine("mysql://" + username + ":" + password + "@" +
                           host + "/" + database)
    db_session = sessionmaker(bind=engine)
    try:
        Proxy.metadata.create_all(engine)
        logger.debug("Tables create success.")
    except Exception as e:
        logger.error(str(e))
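
The ConfigParser lookups above imply a config.ini in which a [Pansidong] section points at a second section holding the MySQL credentials; the actual file is not included here, so the layout below is an inferred sketch with placeholder values:

    # Inferred config.ini layout (placeholder values, not part of the project dump):
    #
    #   [Pansidong]
    #   database = mysql
    #
    #   [mysql]
    #   username = root
    #   password = secret
    #   host = 127.0.0.1
    #   database = pansidong
    #
    # cf.get("Pansidong", "database") then yields "mysql", and the remaining
    # options are read from that section to build the SQLAlchemy engine URL.
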
Example #19
    def start_parse(self):
        # --version
        if self.command_args.version:
            print Version
            sys.exit(0)

        # --update-proxy-db
        if self.command_args.update_proxy_db:
            logger.debug("Update Proxy DB selected.")
            ps = ProxySpider.ProxySpider()
            ps.load()
            ps.start()
            sys.exit(0)

        # --check-proxy
        if self.command_args.check_proxy:
            logger.debug("Check proxy selected.")
            ips = self.command_args.check_proxy
            logger.debug(ips)
            pm = ProxyManage.ProxyManage(ips=ips)
            pm.check()
            sys.exit(0)

        # --check-proxy-all
        if self.command_args.check_proxy_all:
            logger.debug("Check all proxy selected.")
            pm = ProxyManage.ProxyManage(all=True)
            pm.check()
            sys.exit(0)

        # --get-alive-proxy
        if self.command_args.get_alive_proxy:
            logger.debug("Get alive proxy selected.")
            logger.debug(self.command_args.get_alive_proxy)
            pm = ProxyManage.ProxyManage()
            params = self.command_args.get_alive_proxy
            if "," in params:
                amount = params.split(",")[0].strip()
                delay = params.split(",")[1].strip()
                pm.get_alive_proxy(amount, delay)
            else:
                pm.get_alive_proxy(params.strip())

        # --clean-db
        if self.command_args.clean_db:
            logger.debug("Clean db selected.")
            pm = ProxyManage.ProxyManage()
            pm.clean_dead_proxy()
Example #20
import ConfigParser

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from utils.Data.LoggerHelp import logger
from utils.Data.Tables import Proxy

__author__ = "lightless"
__email__ = "*****@*****.**"


if __name__ == "__main__":
    cf = ConfigParser.ConfigParser()
    cf.read("config.ini")
    db_name = cf.get("Pansidong", "database")
    username = cf.get(db_name, "username")
    password = cf.get(db_name, "password")
    host = cf.get(db_name, "host")
    database = cf.get(db_name, "database")

    engine = create_engine("mysql://" + username + ":" + password + "@" + host + "/" + database)
    db_session = sessionmaker(bind=engine)
    try:
        Proxy.metadata.create_all(engine)
        logger.debug("Tables create success.")
    except Exception as e:
        logger.error(str(e))