def getcontent():
    # page through the creditbj.gov.cn credit-service API and store every hit
    # in the `credit` collection
    global j
    while j < 100000:
        try:
            url = ('http://www.creditbj.gov.cn/xyData/front/creditService/'
                   'getPageList.shtml?pageNo=%s&keyword=&typeId=19' % str(j))
            data = requests.get(url=url, headers=headers)
            a = json.loads(data.text)
            db = MongoRedisUrlManager()
            for i in a['hits']['hits']:
                print('start inserting data', j)
                # print(i['_source'])
                db.db.credit.insert(i['_source'])
        except Exception as err:
            print('getcontent() failed on page %s: %s' % (j, err))
        j += 1
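# A minimal sketch of the module-level setup getcontent() relies on; `j` and
# `headers` are referenced as globals but never defined in the source, so the
# values below are illustrative assumptions only.
import json

import requests

j = 1  # hypothetical starting page number
headers = {
    # hypothetical minimal header set; the original dict is elided
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36',
}

if __name__ == '__main__':
    getcontent()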
# webdriver.DesiredCapabilities.PHANTOMJS['phantomjs.page.settings.userAgent'] = \
#     'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'

re_compiled_obj = re.compile(r'\d+')

constants = {
    'MAX_PAGE_TRIED': 2,
    'HB_PERIOD': 5,
    'MAX_SIZE_THREADPOOL': 5,
    'CRAWL_DELAY': 2
}

# Initialize system variables
# dir_name = 'mfw/'

# db manager
webdrivers = {}
dbmanager = MongoRedisUrlManager()
is_root_page = True
threads = []

options = webdriver.ChromeOptions()
# set the browser language to Chinese
options.add_argument('lang=zh_CN.UTF-8')
# disable image loading to speed up page fetches
prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", prefs)

# use hdfs to save pages
# hdfs_client = InsecureClient('http://54.223.92.169:50070', user='******')

socket_client = SocketClient('localhost', 20012)
client_id = 0
hb_period = 5
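# A minimal sketch of how the ChromeOptions configured above would typically be
# used: create a driver, fetch one queued URL, and grab the rendered HTML. The
# {'url': ...} shape of dequeueUrl()'s result is assumed from its use elsewhere
# in this codebase.
driver = webdriver.Chrome(options=options)  # image loading disabled via prefs
task = dbmanager.dequeueUrl()
if task is not None:
    driver.get(task['url'])      # fetch with the configured Chinese-locale profile
    html = driver.page_source    # rendered HTML, ready for lxml/xpath parsing
driver.quit()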
class crawler:
    def __init__(self, host, hostport, mongo, mongoport, redis, redisport, web):
        self.status = "RUNNING"
        self.client_id = None
        self.enqued_num = 0
        self.start = False
        self.get = False
        self.socket = SocketClient(host, hostport)
        self.deque = deque()
        self.dbmanager = MongoRedisUrlManager(mongo, mongoport)
        self.dbmanager.enqueueUrl(web, 'new', 0)
        self.dir_name = 'web/'
        self.max_num_thread = 5
        self.CRAWL_DELAY = 5
        self.last_heartbeat_time = time.time()
        self.web = web
        self.request_headers = {
            'host': "www.mafengwo.cn",
            'connection': "keep-alive",
            'cache-control': "no-cache",
            'upgrade-insecure-requests': "1",
            'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
            'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6"
        }

        if not os.path.exists(self.dir_name):
            os.mkdir(self.dir_name)

        # start a daemon thread that continually sends heartbeats to the master
        try:
            t = threading.Thread(target=self.heartbeat, args=())
            t.daemon = True
            t.start()
        except Exception as err:
            print("failed to start thread HEARTBEAT, error is " + str(err))
        time.sleep(30)
        if self.get:
            self.thread()

    # start multi_process, num is equal to cpu_max
    # facing an unsolved error, kept disabled
    # def multi_process(self, cpu_max):
    #     print("started multi_process")
    #     process = []
    #     for i in range(cpu_max):
    #         p = multiprocessing.Process(target=self.thread, name=None, daemon=True)
    #         process.append(p)
    #     for p in process:
    #         p.start()
    #         p.join()

    def thread(self):
        print("starting thread")
        threads = []
        while True:
            if self.status != "RUNNING":
                time.sleep(1)  # avoid busy-spinning while paused
                continue
            # first remove all finished threads
            for t in threads:
                if not t.is_alive():
                    threads.remove(t)
            if len(threads) >= self.max_num_thread:
                time.sleep(self.CRAWL_DELAY)
                continue
            try:
                # peek at the queue: pop a task and push it straight back
                try:
                    curtask = self.deque.pop()
                    self.deque.append(curtask)
                except IndexError:
                    curtask = None
                if self.status == "RUNNING" and curtask is not None:
                    curtask = self.deque.pop()
                    t = threading.Thread(target=self.get_page_content,
                                         name=None,
                                         args=(curtask[0], curtask[1]))
                    # set daemon so the main thread can exit when it receives ctrl-c
                    t.daemon = True
                    threads.append(t)
                    t.start()
            except Exception as err:
                print("Error: unable to start thread: " + str(err))

    def get_page_content(self, cur_url, depth):
        print("downloading %s at level %d" % (cur_url, depth))
        links = []
        try:
            req = requests.request('GET', cur_url, headers=self.request_headers)
            req.encoding = req.apparent_encoding
            html_page = req.text
            filename = cur_url[7:].replace('/', '_')
            # write the page to the local file system
            with open("%s%s.html" % (self.dir_name, filename), 'wb') as fo:
                fo.write(html_page.encode("utf-8"))
            self.dbmanager.finishUrl(cur_url)
        except Exception as err:
            print(err)
            return
        html = etree.HTML(html_page.lower())
        hrefs = html.xpath(u"//a")
        for href in hrefs:
            try:
                if 'href' not in href.attrib:
                    continue
                val = href.attrib['href']
                if val.find('javascript:') != -1:
                    continue
                if not val.startswith('http://'):
                    if val.startswith('/'):
                        val = self.web + val
                    else:
                        continue
                if val[-1] == '/':
                    val = val[:-1]
                links.append(val)
                # block here while the crawler is paused
                while self.status != "RUNNING":
                    time.sleep(5)
                self.dbmanager.enqueueUrl(val, 'new', depth + 1)
                self.enqued_num += 1
            except ValueError:
                continue
        self.dbmanager.set_url_links(cur_url, links)

    def on_message(self, server_response):
        # "MASSAGE"/"MASSAGE_TYPE" are kept as the wire-format keys the server uses
        request = server_response
        try:
            message = request["MASSAGE"]
            if message == "PAUSE" and self.status == "RUNNING":
                self.status = "PAUSED"
                print("receiving data {} from server".format(request))
            elif message == "RESUME" and self.status == "PAUSED":
                self.status = "RUNNING"
                print("receiving data {} from server".format(request))
            elif message == "WAIT" and self.status == "PAUSED":
                self.status = "PAUSED"
            elif message == "REGISTERED" and self.client_id is None:
                self.client_id = request["CLIENT_ID"]
                print("receiving data {} from server".format(request))
            else:
                print("MASSAGE is invalid")
        except KeyError:
            # no "MASSAGE" key: either a batch of urls, an empty heartbeat
            # reply, or an invalid message
            urls = request.get("URLS")
            if urls:
                for i in urls:
                    self.deque.append(i)
                self.get = True
                print("receiving data {} from server".format(request))
            elif request == {}:
                print("normal HEARTBEAT response received")
            else:
                print("invalid request from server")
        return None

    def heartbeat(self):
        request = {}
        # register the client with the server first
        if self.client_id is None:
            request["MASSAGE_TYPE"] = "REGISTER"
            request["CLIENT_STATUS"] = self.status
            server_response = self.socket.send(json.dumps(request))
            self.on_message(server_response)
            self.last_heartbeat_time = time.time()
        # infinite loop sending HEARTBEAT and URL_REQUEST messages; when the
        # deque is empty, URL_REQUEST takes priority over (and substitutes for)
        # HEARTBEAT
        while True:
            if self.enqued_num > 30:
                self.start = False
            if self.start:
                time.sleep(1)  # avoid busy-spinning while a URL request is pending
                continue
            # peek at the deque to see whether any task is pending
            try:
                a1 = self.deque.pop()
                self.deque.append(a1)
            except IndexError:
                a1 = None
            time1 = time.time()
            if a1 is None and time1 - self.last_heartbeat_time >= 40:
                request["MASSAGE_TYPE"] = "URL_REQUEST"
            elif time1 - self.last_heartbeat_time >= 40:
                request["MASSAGE_TYPE"] = "HEARTBEAT"
            elif a1 is None:
                request["MASSAGE_TYPE"] = "URL_REQUEST"
                self.start = True
            else:
                time.sleep(1)
                continue
            request["CLIENT_STATUS"] = self.status
            request["CLIENT_ID"] = self.client_id
            server_response = self.socket.send(json.dumps(request))
            self.on_message(server_response)
            self.last_heartbeat_time = time.time()
                # NOTE: this fragment begins mid-way through an <a href>
                # extraction loop (compare crawler.get_page_content above);
                # the enclosing for/try header is missing from the source.
                if val.find('javascript:') != -1:
                    continue
                if not val.startswith('http://'):
                    if val.startswith('/'):
                        val = 'http://www.mafengwo.cn' + val
                    else:
                        continue
                if val[-1] == '/':
                    val = val[:-1]
                dbmanager.enqueueUrl(val, 'new', depth + 1)
            except ValueError:
                continue


max_num_thread = 5
dbmanager = MongoRedisUrlManager()
dbmanager.enqueueUrl("http://www.mafengwo.cn", 'new', 0)
start_time = time.time()
is_root_page = True
threads = []
CRAWL_DELAY = 0.6

# use hdfs to save pages
# hdfs_client = InsecureClient('http://54.223.92.169:50070', user='******')

while True:
    curtask = dbmanager.dequeueUrl()
    print(curtask)
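    # The source cuts off after the dequeue; a minimal sketch of how this loop
    # might continue, mirroring crawler.thread() above. The worker function
    # name get_page_content and the task keys 'url'/'depth' are assumptions
    # based on the rest of this codebase, not confirmed by the source.
    if curtask is None:
        break  # queue drained
    while len(threads) >= max_num_thread:
        for t in threads:
            if not t.is_alive():
                threads.remove(t)
        time.sleep(CRAWL_DELAY)
    t = threading.Thread(target=get_page_content,
                         args=(curtask['url'], curtask['depth']))
    t.daemon = True
    threads.append(t)
    t.start()
    time.sleep(CRAWL_DELAY)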
    # NOTE: this fragment begins inside a loop over (key, value) header pairs;
    # the loop header itself is missing from the source.
    webdriver.DesiredCapabilities.PHANTOMJS[
        'phantomjs.page.customHeaders.{}'.format(key)] = value

# another way to set a custom header
webdriver.DesiredCapabilities.PHANTOMJS['phantomjs.page.settings.userAgent'] = \
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'

constants = {
    'MAX_PAGE_TRIED': 2,
    'HB_PERIOD': 5,
    'MAX_SIZE_THREADPOOL': 3,
    'CRAWL_DELAY': 2
}

# db manager
dbmanager = MongoRedisUrlManager()
start_time = time.time()
threads = {}
webdrivers = {}

socket_client = SocketClient('localhost', 20100)
register_request = {}
register_request[pc.MSG_TYPE] = pc.REGISTER
client_id = socket_client.send(json.dumps(register_request))
run_heartbeat = True
server_status = pc.STATUS_RUNNING

re_compiled_obj = re.compile(r'\d{7}')
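# A minimal heartbeat-loop sketch for this client, matching the message shapes
# CrawlMaster handles below (pc.MSG_TYPE/pc.HEARTBEAT/pc.CLIENT_ID and the
# pc.ACTION_REQUIRED response key). The loop body itself is not in the source.
def heartbeat():
    global server_status
    while run_heartbeat:
        hb = {pc.MSG_TYPE: pc.HEARTBEAT, pc.CLIENT_ID: client_id}
        response = json.loads(socket_client.send(json.dumps(hb)))
        # the master piggybacks pause/resume orders on the heartbeat reply
        action = response.get(pc.ACTION_REQUIRED)
        if action == pc.PAUSE_REQUIRED:
            server_status = pc.STATUS_PAUSED
        elif action == pc.RESUME_REQUIRED:
            server_status = pc.STATUS_RUNNING
        time.sleep(constants['HB_PERIOD'])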
class CrawlMaster(object):
    # client registry: {'client_id': {'time': 'xx', 'status': 'xx'}}
    clients = {}

    server_status = pc.STATUS_RUNNING

    last_reorder_time = time.time()

    dbmanager = MongoRedisUrlManager()

    def __init__(self, mongo_client=None, mongo_host='127.0.0.1'):
        self.server = ServerSocket(self.on_message)
        self.server.start()

    def on_message(self, msg):
        # msg is the heartbeat/control message sent by a client
        request = json.loads(msg)
        type = request[pc.MSG_TYPE]
        client_state = {}
        response = {}
        response[pc.SERVER_STATUS] = self.server_status
        if type == pc.REGISTER:
            client_id = self.get_free_id()
            client_state['status'] = pc.STATUS_RUNNING
            client_state['time'] = time.time()
            self.clients[client_id] = client_state
            return client_id
        elif type == pc.UNREGISTER:
            client_id = request.get(pc.CLIENT_ID)
            del self.clients[client_id]
            return json.dumps(response)
        elif type == pc.LOCATIONS:
            crawl_urls = self.dbmanager.dequeueUrls(size=pc.REQUEST_SIZE)
            print(crawl_urls)
            response[pc.MSG_TYPE] = pc.LOCATIONS
            response[pc.CRAWL_DELAY] = pc.CRAWL_DELAY_TIME
            response[pc.DATA] = crawl_urls
            self.flash_hbtime(request)
            return json.dumps(response)
        elif type == pc.TRIPLES:
            crawl_urls = self.dbmanager.dequeueUrls(request[pc.REQUEST_SIZE])
            response[pc.MSG_TYPE] = pc.LOCATIONS
            response[pc.DATA] = crawl_urls
            self.flash_hbtime(request)
            return json.dumps(response)
        elif type == pc.FINISHED_ITEMS:
            # new urls found by the client are saved to the db by the master
            save_urls = request.get(pc.FINISHED_ITEMS)
            self.dbmanager.enqueueUrls(save_urls)
            self.flash_hbtime(request)
            return json.dumps(response)

        client_id = request.get(pc.CLIENT_ID)
        if client_id is None:
            response[pc.ERROR] = pc.ERR_NOT_FOUND
            return json.dumps(response)

        if type == pc.HEARTBEAT:
            if self.server_status != self.clients[client_id]['status']:
                if self.server_status == pc.STATUS_RUNNING:
                    response[pc.ACTION_REQUIRED] = pc.RESUME_REQUIRED
                elif self.server_status == pc.STATUS_PAUSED:
                    response[pc.ACTION_REQUIRED] = pc.PAUSE_REQUIRED
                elif self.server_status == pc.STATUS_SHUTDOWN:
                    response[pc.ACTION_REQUIRED] = pc.SHUTDOWN_REQUIRED
                return json.dumps(response)
            else:
                # a normal heartbeat: just refresh the client's timestamp
                self.flash_hbtime(request)
                return json.dumps(response)
        else:
            if type == pc.PAUSED:
                client_state['status'] = pc.STATUS_PAUSED
            elif type == pc.RESUMED:
                client_state['status'] = pc.STATUS_RUNNING
            client_state['time'] = time.time()  # refresh heartbeat time
            self.clients[client_id] = client_state
            return json.dumps(response)

    def periodical_check(self):
        # check heartbeats and evict clients whose connection was lost
        while True:
            lost_cid = []
            for cid, state in self.clients.items():
                if time.time() - state['time'] > constants['connection_lost_period']:
                    # cannot `del self.clients[cid]` here: the dictionary must not
                    # change size during iteration, so collect the ids and delete below
                    self.clients[cid]['status'] = pc.STATUS_CONNECTION_LOST
                    lost_cid.append(cid)
                    continue
            for cid in lost_cid:
                if self.clients[cid]['status'] != self.server_status:
                    # remove it from the client list
                    del self.clients[cid]
            time.sleep(PERIODICAL_CHECK_TIME)

    def get_free_id(self):
        # return the smallest unused non-negative integer id (as a string);
        # iterate the ids in numeric order so gaps are found correctly
        i = 0
        for key in sorted(self.clients, key=int):
            if i < int(key):
                break
            i += 1
        return str(i)

    def flash_hbtime(self, request):
        # refresh the last-heartbeat timestamp of the requesting client
        client_id = request.get(pc.CLIENT_ID)
        self.clients[client_id]['time'] = time.time()
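# A minimal launch sketch for the master: constructing it starts the
# ServerSocket, and the periodic liveness check then runs on the main thread.
# constants['connection_lost_period'] and PERIODICAL_CHECK_TIME are referenced
# by periodical_check() but defined elsewhere; the values here are assumptions.
if __name__ == '__main__':
    constants = {'connection_lost_period': 60}  # hypothetical timeout, seconds
    PERIODICAL_CHECK_TIME = 10                  # hypothetical check interval, seconds
    master = CrawlMaster()
    master.periodical_check()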
        # NOTE: this fragment begins mid-way through get_page_content(); the
        # function header and the request that produced `r` are not in the source.
        pass
    except Exception as err:
        print("get_page_content()", err)
        pass
    try:
        # collect job-detail links from the listing page
        items = re.findall(r'//www\.lagou\.com/jobs/\d+\.html', r.data.decode('utf-8'))
        links = []
        for i in items:
            fullurl = 'https:' + i
            # print(fullurl)
            db.enqueueUrl(fullurl, 'new')
            links.append(fullurl)
        print(links)
    except Exception:
        pass


def crawl():
    while True:
        # print(cur_queue)
        # url = dequeuUrl()
        url = db.dequeueUrl()['url']
        get_page_content(url)


if __name__ == "__main__":
    db = MongoRedisUrlManager()
    db.clear()
    db.enqueueUrl('https://www.lagou.com', 'new')
    crawl()
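# The top of get_page_content() is missing from the source; `r.data` suggests a
# urllib3 response object. A self-contained sketch of the fetch step under that
# assumption (the PoolManager setup and the plain GET are illustrative only):
import urllib3

http = urllib3.PoolManager()

def fetch(url):
    # returns the raw response whose .data attribute the fragment above decodes
    return http.request('GET', url)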