def run(self):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/52.0.2743.116 Safari/537.36'
    }
    if self.proxy:
        proxies = {"https": "https://" + self.proxy.ip_port()}
    else:
        proxies = None
    # log_writer(proxies)
    try:
        self.__start_time = time.time()
        # log_writer('start to get', self.url)
        self.session = requests.get(self.url, headers=headers,
                                    proxies=proxies, timeout=self.timeout)
        finish_time = time.time()
        self.delta_time = round(finish_time - self.__start_time, 3)
        if self.session.status_code != 200:
            self.delta_time = -1
    except requests.exceptions.Timeout:
        self.delta_time = -1
    except requests.exceptions.ProxyError:
        self.delta_time = -1
    except requests.exceptions.ConnectionError:
        self.delta_time = -1
    except Exception as e:
        # treat any unexpected error as a failed request as well
        self.delta_time = -1
        log_writer(type(e))
        log_writer(e)
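# The Crawler driven by run() above is used elsewhere in this section as a
# threading.Thread (start(), is_alive(), feedback(), delta_time, session).
# A minimal sketch of that scaffolding, as an assumption: the attribute
# defaults and the shape of feedback() are inferred from the managers below,
# not taken from the original __init__.
class Crawler(Thread):
    def __init__(self, url, proxy=None, timeout=10):
        super().__init__()
        self.url = url           # target url to fetch
        self.proxy = proxy       # Proxy object, or None to use the local ip
        self.timeout = timeout   # per-request timeout in seconds (assumed default)
        self.session = None      # requests.Response after a successful fetch
        self.delta_time = -1     # response time in seconds, -1 on failure

    def feedback(self):
        # The managers treat None as "the local ip was used"; otherwise they
        # expect something identifying the proxy and its response time
        # (the exact tuple shape here is an assumption).
        if self.proxy is None:
            return None
        return self.proxy.id, self.delta_time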
def run(self): """ The entrance of the process. This process has three threads: 1.u_listener: used to listen for urls to crawl, and start the crawler 2.th_rmd: used to searching the completed crawler 3.h_listener: used to listen for html to parse """ self.url_queue = Queue(maxsize=self.maxsize_queue) self.html_queue = Queue(maxsize=self.maxsize_queue) u_listener = Thread(target=self.url_listener, name='crawler_creator') u_listener.start() log_writer('url listener start') th_rmd = Thread(target=self.remove_death, name='crawler_sweeper') th_rmd.start() log_writer('spider sweeper start') h_listener = Thread(target=self.html_listener, name='parse_listener') h_listener.start() log_writer('parse listener start') while True: log_writer('Crawler Manager wakeup') for u in self.url_getter(): while self.url_queue.qsize() + 1 >= self.maxsize_queue: # log_writer('current url:', self.url_queue.qsize()) continue self.url_queue.put(u) log_writer('Crawler Manager going to sleep', self.interval_time, 'sec') time.sleep(self.interval_time)
def verify_control(self):
    log_writer("verify control start")
    while self.__db:
        proxies_info = self.get_rarely_proxy()
        for proxy_info in proxies_info:
            self.__verify_queue.put(
                Proxy(proxy_info[0], proxy_info[1], proxy_info[2]))
def remove_death(self):
    global SPIDER_Semaphore
    while True:
        # iterate over a copy so finished verifiers can be removed safely
        for v in list(self.verify_pool):
            if not v.is_alive():
                self.__feedback_queue.put(v.feedback())
                log_writer(v.proxy.id, v.delta_time)
                self.verify_pool.remove(v)
                SPIDER_Semaphore.release()
        time.sleep(1)
def url_listener(self):
    """
    Listens for urls to crawl and starts a crawler thread for each one.
    """
    while True:
        SPIDER_Semaphore.acquire()
        # if there is no usable proxy in the proxy queue, fall back to the local ip
        try:
            proxy = self.__proxy_queue.get(
                timeout=self.base_time +
                self.multiple_time * self.time_of_using_local_ip)
        except Empty:
            proxy = None
            self.time_of_using_local_ip += 1
            log_writer("using local ip", self.time_of_using_local_ip)
        c = Crawler(url=self.url_queue.get(), proxy=proxy)
        c.start()
        self.crawler_pool.append(c)
def remove_death(self):
    global SPIDER_Semaphore
    while True:
        # iterate over a copy so finished crawlers can be removed safely
        for c in list(self.crawler_pool):
            if not c.is_alive():  # the crawler has finished
                SPIDER_Semaphore.release()
                fb = c.feedback()
                if fb is not None:
                    # only report usage when a proxy (not the local ip) was used
                    self.__feedback_queue.put(fb)
                if c.delta_time <= 0:
                    # the crawl failed, so put the url back into the url queue
                    self.url_queue.put(c.url)
                    log_writer(c.url,
                               c.proxy.id if c.proxy else 'local ip',
                               'failed')
                else:
                    # otherwise hand the html text over to the parser
                    self.html_queue.put(c.session.text)
                    log_writer(c.url,
                               c.proxy.id if c.proxy else 'local ip',
                               'succeed')
                self.crawler_pool.remove(c)
def run(self):
    global SPIDER_Semaphore
    rmd = Thread(target=self.remove_death)
    rmd.start()
    start_time = time.time()
    while True:
        for url in TEST_URL:
            SPIDER_Semaphore.acquire()
            try:
                v = CrawlerManager.Crawler(
                    url=url,
                    proxy=self.__verify_queue.get(
                        timeout=self.interval_time / 10))
                v.start()
                self.verify_pool.append(v)
            except Empty:
                # no verifier was started, so give the permit back
                SPIDER_Semaphore.release()
                continue
            finally:
                end_time = time.time()
                if end_time - start_time >= self.interval_time:
                    log_writer("verify manager sleeping",
                               self.sleep_time, 'sec')
                    time.sleep(self.sleep_time)
                    start_time = time.time()
                    log_writer("verify manager wakeup")
def getter_filler(self):
    log_writer("filler start")
    fail_times = 0
    while self.__db:
        proxies_info = self.fill_getter()
        log_writer('put', len(proxies_info), 'proxies')
        if not proxies_info:
            fail_times += 1
            log_writer("no suitable proxy", fail_times)
            time.sleep(self.multiple_timeout * fail_times)
            continue
        for proxy_info in proxies_info:
            self.__getter_queue.put(
                Proxy(proxy_info[0], proxy_info[1], proxy_info[2]))
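# The Proxy objects pushed onto the queues above are built from three database
# columns and, elsewhere in this section, are read via .id and .ip_port().
# A minimal sketch of such a class, assuming the three columns are id, ip and
# port (that column order is an assumption, not confirmed by the source):
class Proxy:
    def __init__(self, proxy_id, ip, port):
        self.id = proxy_id
        self.ip = ip
        self.port = port

    def ip_port(self):
        # "ip:port" string, the format expected by requests proxies and squid_modify()
        return '{}:{}'.format(self.ip, self.port)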
def squid_modify(proxy_queue, amount, file_path='squid.conf'):
    peer_conf = ("cache_peer %s parent %s 0 no-query proxy-only never_direct allow all"
                 " round-robin weight=1 connect-fail-limit=2 allow-miss max-conn=5\n")
    with open(file_path, 'r', encoding='utf-8') as f:
        squid_conf = f.readlines()
    squid_conf.append('\n# Cache peer config\n')
    actually_append = 0
    for i in range(amount):
        try:
            ip, port = proxy_queue.get(timeout=10).ip_port().split(':')
        except Empty:
            continue
        actually_append += 1
        squid_conf.append(peer_conf % (ip, port))
    with open('/etc/squid/squid.conf', 'w') as f:
        f.writelines(squid_conf)
    failed = os.system('squid -k reconfigure')
    if failed:
        log_writer('something wrong in squid, reboot squid')
        p = subprocess.Popen(
            "ps -ef | grep squid | grep -v grep | awk '{print $2}'",
            shell=True, stdout=subprocess.PIPE, universal_newlines=True)
        p.wait()
        result_lines = [int(x.strip()) for x in p.stdout.readlines()]
        log_writer('found', len(result_lines), 'processes')
        if len(result_lines):
            for proc_id in result_lines:
                log_writer('start to kill proc', proc_id)
                os.system('kill -s 9 {}'.format(proc_id))
            log_writer('squid was killed, start new squid now')
            os.system('service squid restart')
            time.sleep(10)
            log_writer('reloading configure')
            os.system('squid -k reconfigure')
    log_writer(actually_append, 'proxy appended')
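# main() below starts a Process running modify_launcher, which is not shown in
# this section. A minimal sketch of what such a launcher could look like,
# assuming it simply rewrites the squid config periodically; the 60-second
# interval is an illustrative value, not taken from the source:
def modify_launcher(proxy_queue, amount):
    while True:
        squid_modify(proxy_queue, amount)
        time.sleep(60)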
def main():
    """
    The entrance of the whole system.
    :return:
    """
    getter_queue = Queue(maxsize=MAXSIZE_OF_QUEUE)
    appender_queue = Queue(maxsize=MAXSIZE_OF_QUEUE)
    usage_queue = Queue(maxsize=MAXSIZE_OF_QUEUE)
    verify_queue = Queue(maxsize=MAXSIZE_OF_QUEUE)
    pm = ProxyManager.ProxyManager(
        getter_queue=getter_queue,
        appender_queue=appender_queue,
        usage_queue=usage_queue,
        verify_queue=verify_queue,
        host=HOST, database=DATABASE, pwd=PASSWORD, user=USER, port=PORT,
        multiple_timeout=MULTIPLE_TIMEOUT_WHILE_NO_PROXY,
        hia_amount=MAX_QUANTITY_OF_HIA_PROXY_SELECT,
        rarely_amount=MAX_QUANTITY_OF_RARELY_USED_PROXY_SELECT,
        rarely_time=INTERVAL_TIME_OF_RARELY_USED_PROXY)
    vm = VerifyManager.VerifyManager(
        verify_queue=verify_queue,
        feedback_queue=usage_queue,
        sleep_time=SLEEPING_TIME_FOR_VERIFY_MANAGER,
        interval_time=INTERVAL_TIME_OF_VERIFY)
    cm = CrawlerManager.CrawlerManager(
        url_getter=Parser.xici_url_construction,
        data_parse=Parser.xici_parse,
        data_queue=appender_queue,
        feedback_queue=usage_queue,
        proxy_queue=getter_queue,
        maxsize_queue=MAXSIZE_OF_QUEUE,
        interval_time=SLEEPING_TIME_FOR_CRAWLER_MANAGER,
        base_time=BASE_TIMEOUT_WHILE_WAITING_PROXY,
        multiple_time=MULTIPLE_TIMEOUT_WHILE_WAITING_PROXY)
    sm = Process(target=modify_launcher, args=(getter_queue, 20))
    try:
        log_writer('pm start')
        pm.start()  # proxy manager start
        log_writer('vm start')
        vm.start()  # verify manager start
        log_writer('cm start')
        cm.start()  # crawler manager start
        log_writer('squid modifier start')
        sm.start()  # squid modifier start
        # the system controller -- only the 'exit' command is supported for now
        while True:
            order = input()
            if order == 'exit':
                break
    finally:
        if pm.is_alive():
            pm.terminate()
        if vm.is_alive():
            vm.terminate()
        if cm.is_alive():
            cm.terminate()
        if sm.is_alive():
            sm.terminate()
def terminate(self): log_writer("verify manager exit") super().terminate()
def terminate(self): log_writer("crawler manager exit") super().terminate()
def terminate(self): log_writer("Proxy Manager exit") super().terminate()
def usage_listener(self):
    log_writer("usage listener start!")
    while self.__db:
        self.add_usage(self.__usage_queue.get())
def appender_listener(self):
    log_writer("appender listener start!")
    while self.__db:
        self.add_proxy(self.__appender_queue.get())