def get_data(self):
    """Parse the detail <div class="list"> table into a dict of field values.

    Each <tr> holds a label cell (tds[0]) and a value cell (tds[1]).
    Value cells containing <span>/<a> elements become nested dicts of
    {utf-8 text: href}; plain cells become utf-8 encoded strings.  Each
    value is stored under both the English key from self.map_info and
    the raw (encoded) original label.

    NOTE(review): `data` is built but neither returned nor persisted in
    this view — presumably handled by a caller or this chunk is
    truncated; verify.
    """
    div_list = self.soup.find('div', 'list')
    trs = div_list.find_all('tr')
    data = {}
    for tr in trs:
        tds = tr.find_all('td')
        spans = tds[1].find_all('span')
        a_list = tds[1].find_all('a')
        if spans:
            # value cell holds <span> items, possibly wrapping links
            content = {}
            for span in spans:
                a = span.find('a')
                if a:
                    content[a.text.replace(u' ', '').encode('utf-8')] = a['href']
                else:
                    content[span.text.strip().encode('utf-8')] = ''
        elif a_list:
            # bare links without surrounding spans
            content = {}
            for a in a_list:
                try:
                    content[a.text.replace(u' ', '').encode('utf-8')] = a['href']
                except KeyError, e:
                    # anchor without an href attribute: keep an empty value
                    content[a.text.replace(u' ', '').encode('utf-8')] = ''
        else:
            # plain text cell
            content = tds[1].text.encode('utf-8')
        try:
            data[self.map_info[tds[0].text]] = content
        except KeyError, e:
            # unknown label: log it and abort so map_info can be extended
            logger.info('map has no key:%s' % tds[0].text)
            raise
        data[tds[0].text.encode('utf-8')] = content
def reconnect(self):
    """Perform an ADSL redial, notifying listeners before and after.

    Sends ppoe_before_adsl, requests a new PPPoE dial unless one is
    already in flight, blocks until the dial completes, then sends
    ppoe_after_adsl.
    """
    logger.info("adsl reconnect")
    ppoe_before_adsl.send(self)
    if not self.get_is_ppoe():  # if is adsling, then skip and wait complete
        self.request_ppoe()
    # always wait for the (requested or in-flight) dial to finish
    self.ready_for_ppoe()
    ppoe_after_adsl.send(self)
def init_connection(self):
    """Open a blocking RabbitMQ connection and channel, retrying forever.

    Each failed attempt (ConnectionClosed) is logged and retried after a
    short pause; returns only once both the connection and a channel are
    established and stored on self.
    """
    while True:
        try:
            conn = pika.BlockingConnection(self.parameters)
            self.connection = conn
            self.channel = conn.channel()
            return
        except pika.exceptions.ConnectionClosed:
            logger.info("rabbitmq connect fail, retry.....")
            time.sleep(2)
def check_error_page(self, response):
    """Raise ResponseError for error pages (HTTP status >= 500 or 404).

    Logs the failing status code and, for HTML responses only, the
    decoded body before raising.

    :param response: a requests Response object
    :raises ResponseError: when status_code >= 500 or status_code == 404
    """
    if response.status_code >= 500 or response.status_code == 404:
        # fix: log messages previously misspelled "reponse"
        logger.info("response error status code : %s" % response.status_code)
        if self.is_text_html(response):  # only log html bodies
            content = self.unicode_content(response.content)
            logger.info(u"response error content: %s" % content)
        raise ResponseError()
def execute(self, task):
    """Fetch the task URL, parse it, and run list extraction.

    Yields None once so the engine's generator protocol is satisfied.

    :raises ExitWithoutDone: when the page does not return HTTP 200
    """
    session = create_session(self)
    logger.info(task.url)
    self.task = task
    response = session.get(task.url)
    if response.status_code != 200:
        # non-200: requeue the task without marking it done
        raise ExitWithoutDone
    soup = BeautifulSoup(response.content)
    self.get_list(soup)
    yield None
def get_data(self):
    """Extract a Shenzhen lawyer profile and upsert it into mongo.

    When self.soup is a parsed page, scrape the lawyer_info table field
    by field (portrait stored base64-encoded); otherwise fall back to
    the minimal fields carried on the task.  The record is upserted into
    mongo_db.sz_lawyer keyed by licence_code.
    """
    data = {}
    if isinstance(self.soup, BeautifulSoup):
        lawyer_info = self.soup.find('table', 'lawyer_info')
        trs = lawyer_info.find_all('tr')
        data['name'] = lawyer_info.find('span', id='lawlist_LawerName').text
        img_url = lawyer_info.find('img', id='lawlist_lsxp').get('src')
        if img_url == '/static/images/cn/none.jpg':
            # placeholder portrait: store empty instead of downloading
            data['img'] = ''
        else:
            data['img'] = base64.b64encode(
                self.session.get(img_url).content)
        data['E_name'] = lawyer_info.find('span', id='lawlist_cym').text
        data['gender'] = lawyer_info.find('span', id='lawlist_LawerSex').text
        data['office'] = lawyer_info.find('span', id='lawlist_Enterprise').text
        data['pro_type'] = lawyer_info.find('span', id='lawlist_Class').text
        data['qualification_code'] = lawyer_info.find(
            'span', id='lawlist_LawerqualNo').text
        data['qualification_date'] = lawyer_info.find(
            'span', id='lawlist_dtLawerqualNo').text
        # licence code comes from the task, not the page
        data['licence_code'] = self.task.licence
        data['licence_date'] = lawyer_info.find('span', id='lawlist_qdzyzsj').text
        data['work_date'] = lawyer_info.find('span', id='lawlist_zszkszysj').text
        data['political'] = lawyer_info.find('span', id='lawlist_zzmm').text
        data['language'] = lawyer_info.find('span', id='lawlist_gzyy').text
        # row 15 holds up to three specialty spans
        spans = trs[14].find_all('span')
        l = [span.text.strip() for span in spans]
        if ','.join(l) == ',,':
            # all specialty slots blank -> no expertise listed
            data['expert'] = ''
        else:
            data['expert'] = ','.join(l)
    else:
        # page failed to load: persist the identifying fields from the task
        data['name'] = self.task.name
        data['licence_code'] = self.task.licence
        data['office'] = self.task.office
    # link the record to its office document, if one exists
    office_id = mongo_db.sz_office.find_one({'name': data['office']})
    if office_id:
        data['office_id'] = str(office_id['_id'])
    else:
        data['office_id'] = ''
    logger.info('Licence:%s' % data['licence_code'])
    mongo_db.sz_lawyer.update_one({'licence_code': data['licence_code']},
                                  {'$set': data}, upsert=True)
def request(self, *args, **kwargs):
    """Issue an HTTP request with retries, timing, and ban/error hooks.

    Retries up to max_retry times across connection errors, read
    timeouts, chunked-encoding errors, server bans (BanError) and error
    pages (ResponseError).  On success returns the Response annotated
    with .content_type and .cost_time.

    :param max_retry: optional per-call override of self.max_retry
    :param content_type: tag attached to the response (default text/html)
    :raises ExitWithoutDone: after all retries are exhausted
    """
    if 'timeout' not in kwargs:
        kwargs['timeout'] = self.timeout
    # per-call override falls back to the session default
    max_retry = kwargs.pop('max_retry', None) or self.max_retry
    content_type = kwargs.pop("content_type", "text/html")
    read_timeout_times = 0
    connection_error_times = 0
    response_error_times = 0
    for i in range(max_retry):
        try:
            before_req_time = time.time()
            response = super(BaseSession, self).request(*args, **kwargs)
            response.content_type = content_type
            span_time = time.time() - before_req_time
            logger.debug("requst cost time: %s" % span_time)
            response.cost_time = span_time
            self.log_response(response)
            self.check_error_page(response)
            self.check_ban(response)
            return response
        except requests.exceptions.ConnectionError:
            connection_error_times += 1
            logger.debug("request session connection error")
            self.connection_error()
        except requests.exceptions.ReadTimeout:
            read_timeout_times += 1
            logger.debug("request session receive data timeout")
        except requests.exceptions.ChunkedEncodingError:
            # not counted towards any error tally
            logger.debug("request session chunked encoding error")
        except BanError:
            # raised by check_ban, so `response` is bound here
            logger.debug("ip has been banned by server")
            self.on_ban(response)
        except ResponseError:
            # raised by check_error_page, so `response` is bound here
            response_error_times += 1
            logger.info("response error page")
            self.on_error(response)
    else:
        # for/else: every attempt failed without returning
        logger.debug("request session connect times greater than %s" % max_retry)
        self.error_times(read_timeout_times, connection_error_times,
                         response_error_times, max_retry)
        raise ExitWithoutDone()
def run(self):
    """Main engine loop: pull tasks, run the worker, route Exit* signals.

    Each iteration fetches the next task, runs the worker's execute()
    generator, pipes yielded Items through the pipeline (merging any
    returned dict into task.meta), and acks/requeues according to the
    exception the worker raises.
    """
    logger.info("engine start running")
    self.scheduler.queue_declare(self.worker.queue)
    while True:
        try:
            task = self.scheduler.next_task(self.worker.queue)
            self.current_task = task
            self.worker.execute_before(task)
            response_yield = self.worker.execute(task)
            for response in response_yield:
                assert isinstance(response, Item) or response is None
                if isinstance(response, Item):
                    ret = self.worker.pipeline.process_item(response)
                    if ret is not None:
                        # pipeline output feeds back into task metadata
                        task.meta.update(ret)
                else:
                    # a bare None yield means the task finished normally
                    task.ack()
                    break
            self.worker.execute_after(task)
        except ExitWithoutDone:
            # requeue the task, then ack the current delivery
            logger.info("exit without done")
            task.in_queue()
            task.ack()
            self.worker.execute_after(task)
        except ExitWithDone:
            logger.info("task finished")
            task.ack()
            self.worker.execute_after(task)
        except ExitWithDoneNoAck:
            # leave the delivery unacked so the broker redelivers it
            logger.info("exit without done, no ack")
            self.worker.execute_after(task)
        except KeyboardInterrupt, error:
            self.worker.handle_exception(error)
            self.stop()
            return
        except socket.timeout, error:
            self.worker.handle_exception(error)
            logger.exception("socket timeout")
            logger.info("try to reconnect database and rabbitmq")
            reconnect_database()
            # NOTE(review): uses the module-level `engine`, not self —
            # verify they refer to the same instance
            engine.scheduler.reconnect()
def execute(self, task):
    """Fetch task.url (when present), parse it, then run data extraction.

    Tasks without a URL set self.soup to '' so get_data can fall back to
    task-supplied fields.  Yields None once when finished.

    :raises ExitWithoutDone: when the page does not return HTTP 200
    """
    self.task = task
    self.session = create_session(self)
    if not self.task.url:
        # no page to fetch — downstream code handles the '' sentinel
        self.soup = ''
    else:
        logger.info(self.task.url)
        response = self.session.get(self.task.url)
        if response.status_code != 200:
            raise ExitWithoutDone
        self.soup = BeautifulSoup(response.content)
    self.get_data()
    yield None
def adsl_model():
    """Redial ADSL on the router over telnet to obtain a fresh public IP.

    Logs into the router at 192.168.2.1 (credentials redacted), kills
    pppd, restarts PPPoE from the stored options file, and waits for the
    dial to complete.
    """
    adsl_signal.send(None)
    logger.info("ip: %s" % get_real_ip())
    HOST = "192.168.2.1"
    user = '******'      # redacted credential
    password = '******'  # redacted credential
    tn = telnetlib.Telnet(HOST)
    # NOTE(review): the two lines below were mangled by credential
    # redaction ("******" appears to have replaced the tn.write() of
    # user/password between read_until calls); restore the original
    # read_until/write pairs before running this code.
    tn.read_until("login: "******"\n")
    tn.read_until("Password: "******"\n")
    kill_pppd = "killall pppd" + "\n"
    tn.write(kill_pppd)
    time.sleep(5)  # pause 5s so redialing too fast doesn't yield the same IP
    pppoe = "pppd file /tmp/ppp/options.wan0" + "\n"
    tn.write(pppoe)
    tn.write("exit\n")
    time.sleep(10)  # wait for the dial to complete
    logger.info("after pptp, ip:%s " % get_real_ip())
def get_data(self):
    """Scrape a Shanghai lawyer profile from self.soup and upsert to mongo.

    Reads name/office from the user-info card, the remaining fields from
    the detail list (keys mapped through self.map_info), downloads the
    portrait as base64, and upserts into mongo_db.sh_lawyer keyed by
    licence_code.
    """
    user_info = self.soup.find('dl', 'user-info')
    user_info_extra = self.soup.find('div', id='detail01')
    office_link = user_info.find('a')
    data = {
        'name': user_info.find('dd', 'name').text.strip(),
        'office': office_link.text.strip(),
        'office_url': office_link.get('href'),
    }
    tag_spans = user_info.find('dd', 'tag').find_all('span')
    expert = ','.join(span.text.strip() for span in tag_spans)
    # ',,' means all three specialty slots were blank
    data['expert'] = '' if expert == ',,' else expert
    img_url = user_info.find('img').get('src')
    data['img'] = base64.b64encode(self.session.get(img_url).content)
    for li in user_info_extra.find_all('li'):
        key, value = li.text.split(u'：', 1)
        data[self.map_info[key]] = value
    logger.info('Licence:%s' % data['licence_code'])
    mongo_db.sh_lawyer.update_one({'licence_code': data['licence_code']},
                                  {'$set': data}, upsert=True)
def reconnect(self):
    """Trigger a PPTP redial via redis and wait for the IP to change.

    NOTE(review): self.is_pptp is read twice and its value may change
    between reads (it appears to be a dynamic property) — the two
    separate checks are kept deliberately; confirm before collapsing
    into a single if/else.
    """
    if self.is_pptp:
        # a redial is already in progress elsewhere
        logger.info(u"检测到正在拨号")
    if not self.is_pptp:
        # request a redial by setting the pptp flag in redis
        logger.info(u"发送拨号信号")
        self.redis.set("pptp", 1)
    self.wait_pptp_complete(self.last_ip)
    logger.info("vpn ip change from %s: to %s" % (self.last_ip, self.now_pptp_ip))
    self.update_vpn_ip()
def pptp():
    """Redial the PPTP VPN via a shell script, logging the IP before/after."""
    ip_before = get_real_ip()
    logger.info("ip: %s" % ip_before)
    os.system("bash pptp.sh")  # blocking shell call
    ip_after = get_real_ip()
    logger.info("after pptp, ip:%s " % ip_after)
def send_list_to_queue(self, queue, body):
    """Log and forward a message body to the named queue via the task."""
    message = '%s: %s' % (queue, body)
    logger.info(message)
    self.task.back_to_queue(queue=queue, body=body)
def stop(self):
    """Announce shutdown and emit the engine_stop signal to listeners."""
    logger.info("engine is stopping")
    engine_stop.send(self)
def close_database(sender):
    """Signal handler: close the shared mongo client connection."""
    logger.info("close database")
    mongo_db.client.close()
class Engine(object):
    """Crawler engine: wires a scheduler, a worker, and the task loop."""

    def start(self, settings):
        """Load settings, build the scheduler, discover worker classes,
        and announce readiness via the engine_setup signal."""
        self.settings = settings
        self.scheduler = Scheduler.from_settings(settings)
        self.workers_cls = self.load_all_wokers()
        self.current_task = None
        # self.setup_log()
        engine_setup.send(self)

    def setup_log(self):
        """Point the file log handler at a per-worker log file.

        NOTE(review): references self.worker_cls, which is never
        assigned anywhere visible (start() sets self.workers_cls) —
        looks stale; the call in start() is commented out.  Verify.
        """
        LOGGING['handlers']['info_file'][
            'filename'] = "log/%s.log" % self.worker_cls.name
        logging.config.dictConfig(LOGGING)

    def load_all_wokers(self):
        """Discover all Worker subclasses in the configured workers dir.

        (Name kept as-is — the "wokers" typo is part of the public
        interface and may have external callers.)
        """
        return get_class_from_module(self.settings.workers_dir, Worker)

    def create_worker(self, name):
        """Instantiate the named worker class; fail loudly when unknown."""
        if name not in self.workers_cls:
            raise Exception(u"no worker with name: %s" % name)
        worker_cls = self.workers_cls[name]
        self.worker = worker_cls(self)

    def run(self):
        """Main loop: pull tasks, run the worker, route Exit* signals.

        Yielded Items go through the pipeline (dict results merge into
        task.meta); a yielded None acks the task.  The Exit* exceptions
        control ack/requeue behaviour; anything else is reported and
        re-raised after shutdown.
        """
        logger.info("engine start running")
        self.scheduler.queue_declare(self.worker.queue)
        while True:
            try:
                task = self.scheduler.next_task(self.worker.queue)
                self.current_task = task
                self.worker.execute_before(task)
                response_yield = self.worker.execute(task)
                for response in response_yield:
                    assert isinstance(response, Item) or response is None
                    if isinstance(response, Item):
                        ret = self.worker.pipeline.process_item(response)
                        if ret is not None:
                            # pipeline output feeds back into task metadata
                            task.meta.update(ret)
                    else:
                        # a bare None yield means the task finished normally
                        task.ack()
                        break
                self.worker.execute_after(task)
            except ExitWithoutDone:
                # requeue the task, then ack the current delivery
                logger.info("exit without done")
                task.in_queue()
                task.ack()
                self.worker.execute_after(task)
            except ExitWithDone:
                logger.info("task finished")
                task.ack()
                self.worker.execute_after(task)
            except ExitWithDoneNoAck:
                # leave the delivery unacked so the broker redelivers it
                logger.info("exit without done, no ack")
                self.worker.execute_after(task)
            except KeyboardInterrupt, error:
                self.worker.handle_exception(error)
                self.stop()
                return
            except socket.timeout, error:
                self.worker.handle_exception(error)
                logger.exception("socket timeout")
                logger.info("try to reconnect database and rabbitmq")
                reconnect_database()
                # NOTE(review): uses the module-level `engine`, not self —
                # verify they refer to the same instance
                engine.scheduler.reconnect()
            except Exception, error:
                # last resort: let the worker try, then report to sentry
                # and shut down, re-raising the original error
                ret = self.worker.handle_exception(error)
                if not ret:
                    client.captureException()
                    logger.info("worker not handler")
                logger.exception("uncaught exception")
                self.stop()
                raise
                return  # NOTE(review): unreachable after raise
def ready_for_ppoe(self):
    """Register this process as ready and block until the redial finishes.

    Marks this pid READY, waits for the PPPoE dial to complete, then
    clears the ready flag again.
    """
    self.send_ready_signal()
    logger.info("wait ppoe complete")
    self.wait_ppoe_complete()
    self.reset_ready_signal()
def close_rabbitmq(sender):
    """Signal handler: close the engine's RabbitMQ connection.

    :param sender: signal sender (unused)
    """
    # fix: log message previously misspelled "rabbbitmq"
    logger.info("close rabbitmq")
    engine.scheduler.close()
def request_ppoe(self):
    """Ask the dialer process for an ADSL redial by setting the redis flag."""
    logger.info("request adsl")
    self.redis.set(REQUEST_PPOE_SIGNAL, 1)
def send_ready_signal(self):
    """Mark this process as READY for redial in the shared redis registry."""
    logger.info("send ready signal")
    self.redis.hset(REGISTER_PIDS, self.get_pid(), READY)
def reconnect(self):
    """Re-establish the RabbitMQ connection and log its open state."""
    self.init_connection()
    state = self.connection.is_open
    logger.info("rabbitmq connection open state: %s" % state)
def check_notify_ppoe_signal(self):
    """Abort the current task (unacked) when a PPPoE redial is pending.

    :raises ExitWithDoneNoAck: after redialing, so the task is redelivered
    """
    if not ppoe_client.get_notify_ppoe_signal():
        return
    logger.info("check ppoe will reconnect")
    self.adsl_reconnect()
    raise ExitWithDoneNoAck()
def start(self):
    """Log the current VPN IP and persist it."""
    logger.info("vpn ip: %s" % self.now_pptp_ip)
    self.update_vpn_ip()
def open_rabbitmq(sender):
    """Signal handler: (re)connect the engine's RabbitMQ scheduler."""
    logger.info("rabbitmq connect")
    engine.scheduler.reconnect()
def close(self):
    """Close the RabbitMQ connection (its channels close with it)."""
    logger.info(u"正在关闭rabbitmq")
    self.connection.close()
def on_ban(self, response):
    """Ban hook: redial ADSL for a new IP, then abort without acking.

    :param response: the banned Response (unused here)
    :raises ExitWithDoneNoAck: so the task stays queued for redelivery
    """
    logger.info("ip has been banned")
    self.adsl_reconnect()
    raise ExitWithDoneNoAck()