Example No. 1
    def check(self, amount=None):
        """
        Check if the proxy address is valid.
        :return: None
        """
        # TODO: switch the checks to multiple threads
        if amount:
            proxy_list = self.session.query(Proxy).filter(Proxy.id <= amount).all()
        else:
            proxy_list = self.session.query(Proxy).all()
        for proxy in proxy_list:
            proxy_ip = proxy.ip
            proxy_port = proxy.port
            logger.info("Testing %s:%s" % (proxy_ip, proxy_port))
            s, t = self.__check_proxy(proxy_ip, proxy_port)
            logger.debug("Time: " + str(t) + " Success: " + str(s))

            # Update the database record
            proxy_item = self.session.query(Proxy).filter(Proxy.id == proxy.id).first()
            proxy_item.times = t
            proxy_item.updated_time = datetime.datetime.now()
            if s:
                proxy_item.is_alive = 1

            self.session.add(proxy_item)
        self.session.commit()
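
Note: the check() method above relies on a private helper, __check_proxy(ip, port), which returns a (success, elapsed-time) pair but is not shown in the example. A minimal sketch of what such a helper might look like, assuming the requests library and an arbitrary test URL (both are assumptions, not part of the original project):

    import datetime

    import requests


    def check_proxy(ip, port, timeout=10):
        """Return (success, elapsed_seconds) for a single HTTP proxy.

        Hypothetical stand-in for the private __check_proxy helper used
        above; the test URL and the timeout are assumptions.
        """
        proxies = {"http": "http://%s:%s" % (ip, port)}
        start = datetime.datetime.now()
        try:
            resp = requests.get("http://httpbin.org/ip",
                                proxies=proxies, timeout=timeout)
            elapsed = (datetime.datetime.now() - start).total_seconds()
            return resp.status_code == 200, elapsed
        except requests.RequestException:
            return False, timeout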
Example No. 2
 def _run(self, join=True):
     for t in self.__thread_list:
         # Wait for a free thread slot
         while True:
             if self.__get_current_alive_thread_count() < self.__thread_count:
                 break
             else:
                 time.sleep(0.5)
         # A slot is free: drop threads that have already finished from the working list.
         # Iterate over a copy so that remove() does not skip items while looping.
         for tt in self.__working_thread_list[:]:
             if not tt.is_alive():
                 logger.debug("[*] " + tt.getName() + " deleted from working list.")
                 self.__working_thread_list.remove(tt)
         # Put this task into the working list
         self.__working_thread_list.append(t)
         # Start the thread
         logger.debug("[*] " + t.getName() + " start.")
         t.start()
         if join:
             for tt in self.__working_thread_list:
                 tt.join()
     while True:
         if self.is_all_thread_dead():
             self.finished = True
             break
         else:
             time.sleep(0.5)
Example No. 3
    def run(self):

        url = self.url
        logger.debug(url)
        driver = webdriver.PhantomJS(executable_path=self.phantomjs_path)
        driver.get(url)
        raw_html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")

        soup = BeautifulSoup(raw_html, "html5lib")
        table = soup.find("tbody")
        t_result = []
        for tr in table.find_all("tr"):
            each_item = dict()
            td = tr.find_all("td")
            each_item['ip'] = td[0].get_text().split(";")[1].split(":")[0]
            each_item['port'] = td[0].get_text().split(";")[1].split(":")[1]
            each_item['type'] = td[1].get_text()
            each_item['location'] = td[2].get_text().strip()
            th = tr.find_all("th")
            each_item['time'] = th[0].get_text()
            t_result.append(each_item)
        result = []
        info = dict()
        info['url'] = self.url
        info['type'] = self.type
        info['tag'] = self.tag
        result.append(info)
        result.append(t_result)
        self.result_queue.put(result)
Example No. 4
    def run(self):

        url = self.url
        logger.debug(url)
        driver = webdriver.PhantomJS(executable_path=self.phantomjs_path)
        driver.get(url)
        raw_html = driver.execute_script(
            "return document.getElementsByTagName('html')[0].innerHTML")

        soup = BeautifulSoup(raw_html, "html5lib")
        table = soup.find("tbody")
        t_result = []
        for tr in table.find_all("tr"):
            each_item = dict()
            td = tr.find_all("td")
            each_item['ip'] = td[0].get_text().split(";")[1].split(":")[0]
            each_item['port'] = td[0].get_text().split(";")[1].split(":")[1]
            each_item['type'] = td[1].get_text()
            each_item['location'] = td[2].get_text().strip()
            th = tr.find_all("th")
            each_item['time'] = th[0].get_text()
            t_result.append(each_item)
        result = []
        info = dict()
        info['url'] = self.url
        info['type'] = self.type
        info['tag'] = self.tag
        result.append(info)
        result.append(t_result)
        self.result_queue.put(result)
Example No. 5
    def my_run(self, page):
        raw_url = "http://www.kuaidaili.com/proxylist/{page}/"
        url = raw_url.replace("{page}", str(page))
        logger.debug(url)
        driver = webdriver.PhantomJS(executable_path=self.phantomjs_path)
        driver.get(url)
        raw_html = driver.execute_script(
            "return document.getElementsByTagName('html')[0].innerHTML")

        soup = BeautifulSoup(raw_html, "html5lib")
        t_result = list()
        for tr in soup.find_all("tr")[1:]:
            each_item = {}
            td = tr.find_all("td")

            # Fill in the fields
            each_item['ip'] = td[0].get_text()
            each_item['port'] = td[1].get_text()
            each_item['type'] = td[2].get_text()
            each_item['protocol'] = td[3].get_text().replace(", ", "-")
            each_item['location'] = td[5].get_text()
            each_item['time'] = filter(lambda ch: ch in '0123456789.',
                                       td[6].get_text().encode("utf8"))
            t_result.append(each_item)
        return t_result
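
Note: filter() applied to the cell text only returns a string under Python 2; under Python 3 it returns an iterator, so the 'time' field would come out wrong. A small sketch of a version that behaves the same on both, assuming the cell text looks like the original (digits plus a unit suffix):

    # Keep only digits and the decimal point from the response-time cell.
    raw_time = td[6].get_text()
    each_item['time'] = "".join(ch for ch in raw_time if ch in "0123456789.")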
Example No. 6
    def __loop(self):
        while True:

            if self.exit:
                logger.debug("ThreadPool loop end.")
                break

            if self.current_working_num() >= self.thread_count:
                # No free worker slot
                time.sleep(1)
                continue

            if self.func_list.empty():
                # No pending tasks
                time.sleep(1)
                continue

            # Fetch a task and run it in a new thread
            task = self.func_list.get_nowait()
            try:
                thread_name = str(task[0].im_class).split(".")[-1].split("'")[0]
            except AttributeError:
                thread_name = task[0].__name__
            thread = threading.Thread(target=task[0], args=task[1], kwargs=task[2], name=thread_name)
            thread.start()
            self.working_list.append(thread)
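
Note: im_class only exists on Python 2 bound methods, which is why the dispatcher falls back to __name__. A hedged, Python 2/3 compatible sketch of the same name lookup (the helper name is illustrative):

    def thread_name_for(func):
        """Best-effort thread name for a queued task callable (sketch only)."""
        # Bound methods expose their instance via __self__ (im_self on old
        # Python 2 versions); plain functions fall back to __name__.
        bound_self = getattr(func, "__self__", None) or getattr(func, "im_self", None)
        if bound_self is not None:
            return bound_self.__class__.__name__
        return getattr(func, "__name__", repr(func))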
Example No. 7
    def __write_database(self, res):
        res = res[1]
        for r in res:
            # First check whether this IP already exists in the database.
            # If both the IP and the port match an existing row,
            # treat it as a duplicate and do not insert it again.

            proxy = self.session.query(Proxy).filter_by(
                ip=r.get("ip"), port=r.get("port")).first()
            if proxy:
                proxy.updated_time = datetime.datetime.now()
                try:
                    self.session.add(proxy)
                    self.session.commit()
                except Exception as e:
                    logger.debug("Update database error. " + str(e))
                continue

            new_proxy = Proxy(ip=r.get("ip", "None"),
                              port=r.get("port", "None"),
                              proxy_type=r.get("type", "None"),
                              location=r.get("location", "None"),
                              protocol=r.get("protocol", "None"),
                              times=r.get("time", "None"),
                              is_alive=0,
                              created_time=datetime.datetime.now(),
                              updated_time=datetime.datetime.now())
            try:
                self.session.add(new_proxy)
                self.session.commit()
            except Exception as e:
                logger.debug("Save database error. " + str(e))
Example No. 8
 def _run(self, join=True):
     for t in self.__thread_list:
         # Wait for a free thread slot
         while True:
             if self.__get_current_alive_thread_count(
             ) < self.__thread_count:
                 break
             else:
                 time.sleep(0.5)
         # A slot is free: drop threads that have already finished from the working list.
         # Iterate over a copy so that remove() does not skip items while looping.
         for tt in self.__working_thread_list[:]:
             if not tt.is_alive():
                 logger.debug("[*] " + tt.getName() +
                              " deleted from working list.")
                 self.__working_thread_list.remove(tt)
         # Put this task into the working list
         self.__working_thread_list.append(t)
         # Start the thread
         logger.debug("[*] " + t.getName() + " start.")
         t.start()
         if join:
             for tt in self.__working_thread_list:
                 tt.join()
     while True:
         if self.is_all_thread_dead():
             self.finished = True
             break
         else:
             time.sleep(0.5)
Example No. 9
    def __loop(self):
        while True:

            if self.exit:
                logger.debug("ThreadPool loop end.")
                break

            if self.current_working_num() >= self.thread_count:
                # No free worker slot
                time.sleep(1)
                continue

            if self.func_list.empty():
                # No pending tasks
                time.sleep(1)
                continue

            # Fetch a task and run it in a new thread
            task = self.func_list.get_nowait()
            try:
                thread_name = str(
                    task[0].im_class).split(".")[-1].split("'")[0]
            except AttributeError:
                thread_name = task[0].__name__
            thread = threading.Thread(target=task[0],
                                      args=task[1],
                                      kwargs=task[2],
                                      name=thread_name)
            thread.start()
            self.working_list.append(thread)
Example No. 10
    def check(self, amount=None):
        """
        Check if the proxy address is valid.
        :return: None
        """
        # TODO: switch the checks to multiple threads
        if amount:
            proxy_list = self.session.query(Proxy).filter(
                Proxy.id <= amount).all()
        else:
            proxy_list = self.session.query(Proxy).all()
        for proxy in proxy_list:
            proxy_ip = proxy.ip
            proxy_port = proxy.port
            logger.info("Testing %s:%s" % (proxy_ip, proxy_port))
            s, t = self.__check_proxy(proxy_ip, proxy_port)
            logger.debug("Time: " + str(t) + " Success: " + str(s))

            # Update the database record
            proxy_item = self.session.query(Proxy).filter(
                Proxy.id == proxy.id).first()
            proxy_item.times = t
            proxy_item.updated_time = datetime.datetime.now()
            if s:
                proxy_item.is_alive = 1

            self.session.add(proxy_item)
        self.session.commit()
Example No. 11
 def is_all_thread_dead(self):
     flags = True
     for t in self.__thread_list:
         if t.is_alive():
             flags = False
         elif t not in self.__dead_threads:
                 logger.debug("[*] " + t.getName() + " finished.")
                 self.__dead_threads.append(t)
     return flags
Example No. 12
 def is_all_thread_dead(self):
     flags = True
     for t in self.__thread_list:
         if t.is_alive():
             flags = False
         elif t not in self.__dead_threads:
             logger.debug("[*] " + t.getName() + " finished.")
             self.__dead_threads.append(t)
     return flags
Example No. 13
 def current_working_num(self):
     working = 0
     # Iterate over a copy so that remove() does not skip items while looping.
     for thread in self.working_list[:]:
         if thread.isAlive():
             # The thread is still running
             working += 1
         else:
             # The thread has finished
             logger.debug("Thread %s end." % thread.name)
             self.working_list.remove(thread)
     self.working_thread_number = working
     return working
Example No. 14
 def __write_database(self, res):
     res = res[1]
     for r in res:
         new_proxy = Proxy(ip=r.get("ip", "None"), port=r.get("port", "None"), proxy_type=r.get("type", "None"),
                           location=r.get("location", "None"), protocol=r.get("protocol", "None"),
                           times=r.get("time", "None"), created_time=datetime.datetime.now(),
                           updated_time=datetime.datetime.now())
         try:
             self.session.add(new_proxy)
             self.session.commit()
         except Exception as e:
             logger.debug("save database error. " + str(e))
Example No. 15
 def current_working_num(self):
     working = 0
     # Iterate over a copy so that remove() does not skip items while looping.
     for thread in self.working_list[:]:
         if thread.isAlive():
             # The thread is still running
             working += 1
         else:
             # The thread has finished
             logger.debug("Thread %s end." % thread.name)
             self.working_list.remove(thread)
     self.working_thread_number = working
     return working
Example No. 16
    def _start(self, target):
        logger.debug("start spider " + target[0])
        deep = target[1]
        target = target[0]

        # Pick one of the PhantomJS instances at random
        phantomjs_tag = random.randint(0, self.phantomjs_count - 1)

        self.driver_pool_lock[phantomjs_tag].acquire()
        retry_times = 2
        while retry_times:
            try:
                self.driver_pool[phantomjs_tag].get(target)
                break
            except Exception:
                # driver.close()
                logger.error("retry %d" % retry_times)
                retry_times -= 1
                if not retry_times:
                    logger.warn("Time out when get %s HTML" % target)
                    self.driver_pool_lock[phantomjs_tag].release()
                    return
                else:
                    continue

        # Fetch the page HTML
        raw_html = self.driver_pool[phantomjs_tag].execute_script(
            "return document.getElementsByTagName('html')[0].innerHTML")
        # Collect the HTTP requests made while the page was loading
        http_log = json.loads(self.driver_pool[phantomjs_tag].get_log("har")[0]
                              ["message"])["log"]["entries"]
        # Record the current page URL
        base_url = self.driver_pool[phantomjs_tag].current_url
        # Release the lock
        self.driver_pool_lock[phantomjs_tag].release()

        soup = BeautifulSoup(raw_html, "html5lib")
        logger.debug("Get %s HTML done. Deep: %s" % (target, deep))

        # Process the href attributes found in the page
        for a in soup.find_all("a", href=True):
            url = a['href'].strip()
            # Skip entries that are not real URLs
            if url.startswith('javascript:') or url.startswith('#') or not url:
                continue
            elif not (url.startswith('https://')
                      or url.startswith('http://')):
                # Convert a relative path into an absolute URL
                url = urlparse.urljoin(base_url, url)
            self.check_same_url(url, deep, self.filter_similar)

        # Process the requests captured while the page was loading
        for log in http_log:
            url = log['request']['url']
            logger.info(url)
            self.check_same_url(url, deep, self.filter_similar)

        logger.debug("".join(["Raw links: ", str(self.raw_links_num)]))
        logger.debug("".join(["Filter links: ", str(self.filter_links_num)]))
Example No. 17
    def _start(self, target):
        logger.debug("start spider " + target[0])
        deep = target[1]
        target = target[0]

        # Pick one of the PhantomJS instances at random
        phantomjs_tag = random.randint(0, self.phantomjs_count-1)

        self.driver_pool_lock[phantomjs_tag].acquire()
        retry_times = 2
        while retry_times:
            try:
                self.driver_pool[phantomjs_tag].get(target)
                break
            except Exception:
                # driver.close()
                logger.error("retry %d" % retry_times)
                retry_times -= 1
                if not retry_times:
                    logger.warn("Time out when get %s HTML" % target)
                    self.driver_pool_lock[phantomjs_tag].release()
                    return
                else:
                    continue

        # Fetch the page HTML
        raw_html = self.driver_pool[phantomjs_tag].execute_script(
            "return document.getElementsByTagName('html')[0].innerHTML"
        )
        # Collect the HTTP requests made while the page was loading
        http_log = json.loads(self.driver_pool[phantomjs_tag].get_log("har")[0]["message"])["log"]["entries"]
        # Record the current page URL
        base_url = self.driver_pool[phantomjs_tag].current_url
        # Release the lock
        self.driver_pool_lock[phantomjs_tag].release()

        soup = BeautifulSoup(raw_html, "html5lib")
        logger.debug("Get %s HTML done. Deep: %s" % (target, deep))

        # Process the href attributes found in the page
        for a in soup.find_all("a", href=True):
            url = a['href'].strip()
            # Skip entries that are not real URLs
            if url.startswith('javascript:') or url.startswith('#') or not url:
                continue
            elif not (url.startswith('https://') or url.startswith('http://')):
                # Convert a relative path into an absolute URL
                url = urlparse.urljoin(base_url, url)
            self.check_same_url(url, deep, self.filter_similar)

        # Process the requests captured while the page was loading
        for log in http_log:
            url = log['request']['url']
            logger.info(url)
            self.check_same_url(url, deep, self.filter_similar)

        logger.debug("".join(["Raw links: ", str(self.raw_links_num)]))
        logger.debug("".join(["Filter links: ", str(self.filter_links_num)]))
Example No. 18
    def start(self):
        logger.debug("start of web spider.")

        # Start the thread pool, which also starts the task dispatcher
        self.spider_pool = ThreadPool(self.thread_count)
        # Crawl the first page
        self.spider_pool.add_func(self._start,
                                  target=self.task_queue.get_nowait())
        while True:

            if (not self.spider_pool.working_thread_number
                ) and self.task_queue.empty():
                time.sleep(2)
                if (not self.spider_pool.working_thread_number
                    ) and self.task_queue.empty():
                    self.spider_pool.terminated()
                    logger.debug("WebSpider loop end.")
                    break

            if self.task_queue.empty():
                time.sleep(1)
                continue

            target = self.task_queue.get_nowait()
            self.spider_pool.add_func(self._start,
                                      target=(target[0], target[1]))
            time.sleep(0.1)

        logger.debug("end of web spider")
Example No. 19
    def start(self):
        logger.debug("start of web spider.")

        # Start the thread pool, which also starts the task dispatcher
        self.spider_pool = ThreadPool(self.thread_count)
        # Crawl the first page
        self.spider_pool.add_func(self._start, target=self.task_queue.get_nowait())
        while True:

            if (not self.spider_pool.working_thread_number) and self.task_queue.empty():
                time.sleep(2)
                if (not self.spider_pool.working_thread_number) and self.task_queue.empty():
                    self.spider_pool.terminated()
                    logger.debug("WebSpider loop end.")
                    break

            if self.task_queue.empty():
                time.sleep(1)
                continue

            target = self.task_queue.get_nowait()
            self.spider_pool.add_func(self._start, target=(target[0], target[1]))
            time.sleep(0.1)

        logger.debug("end of web spider")
Example No. 20
    def my_run(self, page):
        raw_url = "http://www.kuaidaili.com/proxylist/{page}/"
        url = raw_url.replace("{page}", str(page))
        logger.debug(url)
        driver = webdriver.PhantomJS(executable_path=self.phantomjs_path)
        driver.get(url)
        raw_html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")

        soup = BeautifulSoup(raw_html, "html5lib")
        t_result = list()
        for tr in soup.find_all("tr")[1:]:
            each_item = {}
            td = tr.find_all("td")

            # Fill in the fields
            each_item['ip'] = td[0].get_text()
            each_item['port'] = td[1].get_text()
            each_item['type'] = td[2].get_text()
            each_item['protocol'] = td[3].get_text().replace(", ", "-")
            each_item['location'] = td[5].get_text()
            each_item['time'] = filter(lambda ch: ch in '0123456789.', td[6].get_text().encode("utf8"))
            t_result.append(each_item)
        return t_result
Example No. 21
web_spider = WebSpider(
    target="http://www.yundaex.com/",
    limit_domain=['*.yundaex.com'],
    deep=5,
    thread_count=50
)
web_spider.do_spider()
# web_spider.start()
# while True:
#     time.sleep(1)
#     print web_spider.links
# time.sleep(1)
while True:
    time.sleep(5)
    logger.debug("Alive thread: %d" % web_spider.spider_pool.working_thread_number)
    logger.debug("Left tasks number: %d" % web_spider.task_queue.qsize())
    logger.debug("links num before filter: %d" % web_spider.raw_links_num)
    logger.debug("links num after filter: %d" % web_spider.filter_links_num)

    if web_spider.spider_pool.working_thread_number == 0:
        break
#
# print web_spider.links
with open("urls.txt", "w") as ff:
    for url in web_spider.links:
        ff.write(url.decode('utf8') + "\n")

# urls = ['www.lightless.me', 'www.baidu.com']
# jobs = [gevent.spawn(socket.gethostbyname, url) for url in urls]
# # gevent.joinall(jobs)
Example No. 22
import ConfigParser

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from utils.data.LoggerHelp import logger
from utils.data.Tables import Proxy

__author__ = "lightless"
__email__ = "*****@*****.**"


if __name__ == "__main__":
    cf = ConfigParser.ConfigParser()
    cf.read("config.ini")
    db_name = cf.get("ProxySpider", "database")
    username = cf.get(db_name, "username")
    password = cf.get(db_name, "password")
    host = cf.get(db_name, "host")
    database = cf.get(db_name, "database")

    engine = create_engine("mysql://" + username + ":" + password + "@" + host + "/" + database)
    db_session = sessionmaker(bind=engine)
    try:
        Proxy.metadata.create_all(engine)
        logger.debug("Tables create success.")
    except Exception as e:
        logger.error(str(e))
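
Note: the script expects a config.ini with a [ProxySpider] section naming the database section, plus a section of that name holding the connection settings. A sketch of such a file (all section names other than ProxySpider and all values are placeholders):

    [ProxySpider]
    database = mysql

    [mysql]
    username = proxyspider
    password = change-me
    host = 127.0.0.1
    database = proxy_spider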