Code example #1
 def run(self):
     headers = {
         'User-Agent':
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) '
         'AppleWebKit/537.36 (KHTML, like Gecko) '
         'Chrome/52.0.2743.116 Safari/537.36'
     }
     if self.proxy:
         proxies = {"https": "https://" + self.proxy.ip_port()}
     else:
         proxies = None
     # log_writer(proxies)
     try:
         self.__start_time = time.time()
         # log_writer('start to get', self.url)
         self.session = requests.get(self.url,
                                     headers=headers,
                                     proxies=proxies,
                                     timeout=self.timeout)
         finish_time = time.time()
         self.delta_time = round(finish_time - self.__start_time, 3)
         if self.session.status_code != 200:
             self.delta_time = -1
     except (requests.exceptions.Timeout,
             requests.exceptions.ProxyError,
             requests.exceptions.ConnectionError):
         # timeouts and connection problems are treated as a failed fetch
         self.delta_time = -1
     except Exception as e:
         log_writer(type(e))
         log_writer(e)
         self.delta_time = -1  # unexpected errors also count as a failure
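The examples throughout this listing call a shared log_writer helper and use a Proxy object with an id attribute and an ip_port() method; neither definition appears here. A minimal sketch of what they are assumed to look like (field names and constructor order are inferred from the call sites, not confirmed by the source):

import time


def log_writer(*args):
    # Print the arguments with a timestamp; the project presumably also writes them to a log file.
    print(time.strftime('%Y-%m-%d %H:%M:%S'), *args)


class Proxy:
    def __init__(self, proxy_id, ip, port):
        # Order (id, ip, port) is inferred from Proxy(proxy_info[0], proxy_info[1], proxy_info[2]).
        self.id = proxy_id
        self.ip = ip
        self.port = port

    def ip_port(self):
        # "ip:port" string, used for the requests proxies dict and the squid cache_peer lines.
        return '{}:{}'.format(self.ip, self.port)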
Code example #2
    def run(self):
        """
        The entrance of the process.
        This process has three threads:
        1.u_listener: used to listen for urls to crawl, and start the crawler
        2.th_rmd: used to searching the completed crawler
        3.h_listener: used to listen for html to parse
        """
        self.url_queue = Queue(maxsize=self.maxsize_queue)
        self.html_queue = Queue(maxsize=self.maxsize_queue)
        u_listener = Thread(target=self.url_listener, name='crawler_creator')
        u_listener.start()
        log_writer('url listener start')
        th_rmd = Thread(target=self.remove_death, name='crawler_sweeper')
        th_rmd.start()
        log_writer('spider sweeper start')
        h_listener = Thread(target=self.html_listener, name='parse_listener')
        h_listener.start()
        log_writer('parse listener start')

        while True:
            log_writer('Crawler Manager wakeup')
            for u in self.url_getter():
                while self.url_queue.qsize() + 1 >= self.maxsize_queue:
                    # busy-wait until a slot opens in the url queue
                    # log_writer('current url:', self.url_queue.qsize())
                    continue
                self.url_queue.put(u)
            log_writer('Crawler Manager going to sleep', self.interval_time,
                       'sec')
            time.sleep(self.interval_time)
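The inner while loop above spins on qsize() until the crawler threads drain the queue. Since url_queue is created with a maxsize, a blocking put gives the same back-pressure without burning CPU; a minimal sketch of that variant (not the author's code):

for u in self.url_getter():
    # put() blocks until a slot is free because the queue was created with a maxsize
    self.url_queue.put(u)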
Code example #3
File: ProxyManager.py  Project: Tsangkam/proxy_pool
 def verify_control(self):
     log_writer("verify control start")
     while self.__db:
         # fetch rarely-used proxies and queue them for re-verification
         proxies_info = self.get_rarely_proxy()
         for proxy_info in proxies_info:
             self.__verify_queue.put(
                 Proxy(proxy_info[0], proxy_info[1], proxy_info[2]))
Code example #4
 def remove_death(self):
     global SPIDER_Semaphore
     while True:
         for v in list(self.verify_pool):  # iterate over a copy; removing from the live list would skip items
             if not v.is_alive():
                 self.__feedback_queue.put(v.feedback())
                 log_writer(v.proxy.id, v.delta_time)
                 self.verify_pool.remove(v)
                 SPIDER_Semaphore.release()
         time.sleep(1)
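SPIDER_Semaphore caps how many crawler/verifier threads run at once; its definition is not part of this listing. It is presumably a module-level threading.Semaphore, roughly like this (the limit value here is a placeholder, not taken from the project):

from threading import Semaphore

MAX_CONCURRENT_SPIDERS = 20  # hypothetical limit; the real value comes from project configuration
SPIDER_Semaphore = Semaphore(MAX_CONCURRENT_SPIDERS)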
Code example #5
 def url_listener(self):
     """
     used to listen the url to crawler, and start a crawler as a thread
     """
     start_time = time.time()
     while True:
         SPIDER_Semaphore.acquire()
         # if no usable proxy arrives in time, fall back to the local IP
         try:
             proxy = self.__proxy_queue.get(
                 timeout=self.base_time +
                 self.multiple_time * self.time_of_using_local_ip)
         except Empty:
             proxy = None
             self.time_of_using_local_ip += 1
             log_writer("using local ip", self.time_of_using_local_ip)
         c = Crawler(url=self.url_queue.get(), proxy=proxy)
         c.start()
         self.crawler_pool.append(c)
Code example #6
 def remove_death(self):
     global SPIDER_Semaphore
     while True:
         for c in list(self.crawler_pool):  # iterate over a copy; removing from the live list would skip items
             if not c.is_alive():  # the crawler has finished
                 SPIDER_Semaphore.release()
                 if c.feedback() is not None:  # feedback exists only when a proxy (not the local IP) was used
                     self.__feedback_queue.put(c.feedback())
                 if c.delta_time <= 0:  # if the crawler failed, put the url into the url queue again
                     self.url_queue.put(c.url)
                     log_writer(c.url,
                                c.proxy.id if c.proxy else 'local ip',
                                'failed')
                 else:  # else put the html text into the html queue
                     self.html_queue.put(c.session.text)
                     log_writer(c.url,
                                c.proxy.id if c.proxy else 'local ip',
                                'succeed')
                 self.crawler_pool.remove(c)
Code example #7
 def run(self):
     global SPIDER_Semaphore
     rmd = Thread(target=self.remove_death)
     rmd.start()
     start_time = time.time()
     while True:
         for url in TEST_URL:
             SPIDER_Semaphore.acquire()
             try:
                 v = CrawlerManager.Crawler(
                     url=url,
                     proxy=self.__verify_queue.get(timeout=self.interval_time / 10))
                 v.start()
                 self.verify_pool.append(v)
             except Empty:
                 SPIDER_Semaphore.release()  # no proxy to verify, so return the slot acquired above
                 continue
             finally:
                 end_time = time.time()
                 if end_time - start_time >= self.interval_time:
                     log_writer("verify manager sleeping", self.sleep_time, 'sec')
                     time.sleep(self.sleep_time)
                     start_time = time.time()
                     log_writer("verify manager wakeup")
Code example #8
File: ProxyManager.py  Project: Tsangkam/proxy_pool
 def getter_filler(self):
     log_writer("filler start")
     fail_times = 0
     while self.__db:
         proxies_info = self.fill_getter()
         if not proxies_info:
             fail_times += 1
             log_writer("no suitable proxy", fail_times)
             # back off a little longer after each failure
             time.sleep(self.multiple_timeout * fail_times)
             continue
         log_writer('put', len(proxies_info), 'proxies')
         for proxy_info in proxies_info:
             self.__getter_queue.put(
                 Proxy(proxy_info[0], proxy_info[1], proxy_info[2]))
Code example #9
File: SquidModify.py  Project: Tsangkam/proxy_pool
def squid_modify(proxy_queue, amount, file_path='squid.conf'):
    peer_conf = "cache_peer %s parent %s 0 no-query proxy-only never_direct allow all" \
                " round-robin weight=1 connect-fail-limit=2 allow-miss max-conn=5\n"
    with open(file_path, 'r', encoding='utf-8') as f:
        squid_conf = f.readlines()
    squid_conf.append('\n# Cache peer config\n')
    actually_append = 0
    for i in range(amount):
        try:
            ip, port = proxy_queue.get(timeout=10).ip_port().split(':')
        except Empty:
            continue
        actually_append += 1
        squid_conf.append(peer_conf % (ip, port))
    # write the merged config to the live squid config; the template at file_path is left unchanged
    with open('/etc/squid/squid.conf', 'w') as f:
        f.writelines(squid_conf)
    failed = os.system('squid -k reconfigure')
    if failed:
        log_writer('something wrong with squid, restarting squid')
        p = subprocess.Popen(
            "ps -ef | grep squid | grep -v grep  | awk '{print $2}'",
            shell=True,
            stdout=subprocess.PIPE,
            universal_newlines=True)
        p.wait()
        result_lines = [int(x.strip()) for x in p.stdout.readlines()]
        log_writer('found', len(result_lines), 'processes')
        if len(result_lines):
            for proc_id in result_lines:
                log_writer('start to kill proc', proc_id)
                os.system('kill -s 9 {}'.format(proc_id))
            log_writer('squid was killed, start new squid now')
            os.system('service squid restart')
            time.sleep(10)
            log_writer('reloading configure')
            os.system('squid -k reconfigure')
    log_writer(actually_append, 'proxies appended')
Code example #10
File: Scheduler.py  Project: Tsangkam/proxy_pool
def main():
    """
    the entrance of whole system
    :return:
    """
    getter_queue = Queue(maxsize=MAXSIZE_OF_QUEUE)
    appender_queue = Queue(maxsize=MAXSIZE_OF_QUEUE)
    usage_queue = Queue(maxsize=MAXSIZE_OF_QUEUE)
    verify_queue = Queue(maxsize=MAXSIZE_OF_QUEUE)
    pm = ProxyManager.ProxyManager(getter_queue=getter_queue,
                                   appender_queue=appender_queue,
                                   usage_queue=usage_queue,
                                   verify_queue=verify_queue,
                                   host=HOST,
                                   database=DATABASE,
                                   pwd=PASSWORD,
                                   user=USER,
                                   port=PORT,
                                   multiple_timeout=MULTIPLE_TIMEOUT_WHILE_NO_PROXY,
                                   hia_amount=MAX_QUANTITY_OF_HIA_PROXY_SELECT,
                                   rarely_amount=MAX_QUANTITY_OF_RARELY_USED_PROXY_SELECT,
                                   rarely_time=INTERVAL_TIME_OF_RARELY_USED_PROXY)

    vm = VerifyManager.VerifyManager(verify_queue=verify_queue,
                                     feedback_queue=usage_queue,
                                     sleep_time=SLEEPING_TIME_FOR_VERIFY_MANAGER,
                                     interval_time=INTERVAL_TIME_OF_VERIFY)

    cm = CrawlerManager.CrawlerManager(url_getter=Parser.xici_url_construction,
                                       data_parse=Parser.xici_parse,
                                       data_queue=appender_queue,
                                       feedback_queue=usage_queue,
                                       proxy_queue=getter_queue,
                                       maxsize_queue=MAXSIZE_OF_QUEUE,
                                       interval_time=SLEEPING_TIME_FOR_CRAWLER_MANAGER,
                                       base_time=BASE_TIMEOUT_WHILE_WAITING_PROXY,
                                       multiple_time=MULTIPLE_TIMEOUT_WHILE_WAITING_PROXY)
    sm = Process(target=modify_launcher, args=(getter_queue, 20))
    try:
        log_writer('pm start')
        pm.start()  # proxy manager start
        log_writer('vm start')
        vm.start()  # verify manager start
        log_writer('cm start')
        cm.start()  # crawler manager start
        log_writer('squid modifier start')
        sm.start()    # squid modifier start
        # the system controller; only the 'exit' command is supported for now
        while True:
            order = input()
            if order == 'exit':
                break
    finally:
        if pm.is_alive():
            pm.terminate()
        if vm.is_alive():
            vm.terminate()
        if cm.is_alive():
            cm.terminate()
        if sm.is_alive():
            sm.terminate()
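modify_launcher, started as a separate process above, is not included in this listing. Assuming it simply re-runs squid_modify (code example #9) on a fixed cycle, it could look roughly like the sketch below; the interval and signature are guesses, not the project's actual code:

def modify_launcher(proxy_queue, amount, interval=300):
    # Hypothetical driver loop: rebuild the squid config from the proxy queue, then wait.
    while True:
        squid_modify(proxy_queue, amount)
        time.sleep(interval)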
Code example #11
 def terminate(self):
     log_writer("verify manager exit")
     super().terminate()
Code example #12
 def terminate(self):
     log_writer("crawler manager exit")
     super().terminate()
Code example #13
File: ProxyManager.py  Project: Tsangkam/proxy_pool
 def terminate(self):
     log_writer("Proxy Manager exit")
     super().terminate()
Code example #14
File: ProxyManager.py  Project: Tsangkam/proxy_pool
 def usage_listener(self):
     log_writer("usage listener start!")
     while self.__db:
         self.add_usage(self.__usage_queue.get())
Code example #15
File: ProxyManager.py  Project: Tsangkam/proxy_pool
 def appender_listener(self):
     log_writer("appender listener start!")
     while self.__db:
         self.add_proxy(self.__appender_queue.get())