예제 #1
0
 def get_data(self):
     """Parse the rows of the ``div.list`` table into a dict.

     For every ``<tr>`` the first cell is the label and the second cell is
     the value.  The value is stored as:

     * a dict ``{utf-8 link text: href}`` when the cell contains ``<span>``
       elements (a span without a link maps its stripped text to ``''``);
     * a dict ``{utf-8 link text: href}`` when it contains ``<a>`` elements
       but no spans (a link without ``href`` maps to ``''``);
     * the utf-8 encoded cell text otherwise.

     Labels are translated through ``self.map_info``.

     Raises:
         KeyError: when a label is missing from ``self.map_info`` (the
             label is logged before re-raising).
     """
     div_list = self.soup.find('div', 'list')
     data = {}
     for tr in div_list.find_all('tr'):
         tds = tr.find_all('td')
         value_cell = tds[1]
         spans = value_cell.find_all('span')
         a_list = value_cell.find_all('a')
         if spans:
             content = {}
             for span in spans:
                 a = span.find('a')
                 if a:
                     content[a.text.replace(u' ',
                                            '').encode('utf-8')] = a['href']
                 else:
                     content[span.text.strip().encode('utf-8')] = ''
         elif a_list:
             content = {}
             for a in a_list:
                 # A link without an href attribute is stored with ''.
                 content[a.text.replace(u' ', '').encode('utf-8')] = a.get(
                     'href', '')
         else:
             content = value_cell.text.encode('utf-8')

         label = tds[0].text
         try:
             data[self.map_info[label]] = content
         except KeyError:
             logger.info('map has no key:%s' % label)
             # NOTE(review): a fallback that stored the value under the raw
             # utf-8 label used to follow this ``raise`` but could never
             # run; restore it (and drop the raise) if unknown labels
             # should be tolerated instead of aborting.
             raise
     # NOTE(review): ``data`` is neither returned nor stored in the visible
     # code -- confirm against the full file.
예제 #2
0
파일: client.py 프로젝트: pijiupapa/crawler
 def reconnect(self):
     """Run one ADSL redial cycle for this client.

     Emits ``ppoe_before_adsl``, requests a redial unless ``get_is_ppoe()``
     is already truthy, waits through ``ready_for_ppoe()`` and finally
     emits ``ppoe_after_adsl``.
     """
     logger.info("adsl reconnect")
     ppoe_before_adsl.send(self)
     redial_in_progress = self.get_is_ppoe()
     if not redial_in_progress:
         # Only request a redial when none is running; otherwise just wait
         # for the running one to complete.
         self.request_ppoe()
     self.ready_for_ppoe()
     ppoe_after_adsl.send(self)
예제 #3
0
 def init_connection(self):
     """Open the RabbitMQ connection and channel, retrying until it works.

     Each ``pika.exceptions.ConnectionClosed`` is logged and followed by a
     two second pause before the next attempt.
     """
     connected = False
     while not connected:
         try:
             self.connection = pika.BlockingConnection(self.parameters)
             self.channel = self.connection.channel()
         except pika.exceptions.ConnectionClosed:
             logger.info("rabbitmq connect fail, retry.....")
             time.sleep(2)
         else:
             connected = True
예제 #4
0
    def check_error_page(self, response):
        """Raise ``ResponseError`` for a 404 or a status code of 500 and up.

        The status code is always logged for such responses; the body is
        logged too when ``is_text_html`` accepts the response.
        """
        status = response.status_code
        if status < 500 and status != 404:
            return

        logger.info("reponse error status code : %s" % status)

        if self.is_text_html(response):
            # only HTML bodies are logged
            body = self.unicode_content(response.content)
            logger.info(u"reponse error content: %s" % body)
        raise ResponseError()
예제 #5
0
 def execute(self, task):
     """Fetch ``task.url`` and pass the parsed page to ``get_list``.

     This is a generator that yields a single ``None`` after the page has
     been processed.

     Raises:
         ExitWithoutDone: if the response status is not 200.
     """
     session = create_session(self)
     logger.info(task.url)
     self.task = task
     page = session.get(task.url)
     if page.status_code != 200:
         raise ExitWithoutDone
     self.get_list(BeautifulSoup(page.content))
     yield None
예제 #6
0
 def get_data(self):
     """Build one lawyer record and upsert it into ``mongo_db.sz_lawyer``.

     When ``self.soup`` is a ``BeautifulSoup`` document the fields are read
     from its ``table.lawyer_info`` element; otherwise only name, licence
     and office are copied from ``self.task``.  The office name is then
     looked up in ``mongo_db.sz_office`` to fill ``office_id`` (``''`` when
     no document matches).
     """
     data = {}
     if isinstance(self.soup, BeautifulSoup):
         lawyer_info = self.soup.find('table', 'lawyer_info')
         trs = lawyer_info.find_all('tr')

         def span_text(span_id):
             # Text of the <span> with the given id inside the info table.
             return lawyer_info.find('span', id=span_id).text

         data['name'] = span_text('lawlist_LawerName')

         img_url = lawyer_info.find('img', id='lawlist_lsxp').get('src')
         if img_url != '/static/images/cn/none.jpg':
             data['img'] = base64.b64encode(
                 self.session.get(img_url).content)
         else:
             # the placeholder picture is stored as an empty string
             data['img'] = ''

         for field, span_id in (
                 ('E_name', 'lawlist_cym'),
                 ('gender', 'lawlist_LawerSex'),
                 ('office', 'lawlist_Enterprise'),
                 ('pro_type', 'lawlist_Class'),
                 ('qualification_code', 'lawlist_LawerqualNo'),
                 ('qualification_date', 'lawlist_dtLawerqualNo')):
             data[field] = span_text(span_id)

         data['licence_code'] = self.task.licence

         for field, span_id in (
                 ('licence_date', 'lawlist_qdzyzsj'),
                 ('work_date', 'lawlist_zszkszysj'),
                 ('political', 'lawlist_zzmm'),
                 ('language', 'lawlist_gzyy')):
             data[field] = span_text(span_id)

         # Row 14 holds the expertise spans; a join of ',,' (three empty
         # spans) is stored as an empty string.
         expert = ','.join(
             [span.text.strip() for span in trs[14].find_all('span')])
         data['expert'] = '' if expert == ',,' else expert
     else:
         data['name'] = self.task.name
         data['licence_code'] = self.task.licence
         data['office'] = self.task.office

     office = mongo_db.sz_office.find_one({'name': data['office']})
     data['office_id'] = str(office['_id']) if office else ''

     logger.info('Licence:%s' % data['licence_code'])
     mongo_db.sz_lawyer.update_one(
         {'licence_code': data['licence_code']},
         {'$set': data},
         upsert=True)
예제 #7
0
    def request(self, *args, **kwargs):
        """Send a request through the parent session, retrying on failure.

        Extra keyword arguments consumed here (not forwarded):
            max_retry: number of attempts; falls back to ``self.max_retry``
                when missing or falsy.
            content_type: stored on the response as ``content_type``
                (default ``"text/html"``).

        ``timeout`` defaults to ``self.timeout`` when not supplied.

        Returns:
            The first response that passes ``check_error_page`` and
            ``check_ban``; it also carries ``cost_time`` (seconds).

        Raises:
            ExitWithoutDone: when every attempt failed.
        """
        if 'timeout' not in kwargs:
            kwargs['timeout'] = self.timeout
        attempts = kwargs.pop('max_retry', None) or self.max_retry
        content_type = kwargs.pop("content_type", "text/html")

        timeouts = 0
        conn_errors = 0
        bad_responses = 0
        for _attempt in range(attempts):
            try:
                started = time.time()
                response = super(BaseSession, self).request(*args, **kwargs)
                response.content_type = content_type
                elapsed = time.time() - started
                logger.debug("requst cost time: %s" % elapsed)
                response.cost_time = elapsed
                self.log_response(response)
                self.check_error_page(response)
                self.check_ban(response)
                return response

            except requests.exceptions.ConnectionError:
                conn_errors += 1
                logger.debug("request session connection error")
                self.connection_error()

            except requests.exceptions.ReadTimeout:
                timeouts += 1
                logger.debug("request session receive data timeout")

            except requests.exceptions.ChunkedEncodingError:
                logger.debug("request session chunked encoding error")

            except BanError:
                logger.debug("ip has been banned by server")
                self.on_ban(response)

            except ResponseError:
                bad_responses += 1
                logger.info("response error page")
                self.on_error(response)

        # Reaching this point means no attempt returned a response.
        logger.debug("request session connect times greater than %s" %
                     attempts)
        self.error_times(timeouts, conn_errors, bad_responses, attempts)
        raise ExitWithoutDone()
예제 #8
0
파일: engine.py 프로젝트: pijiupapa/crawler
    def run(self):
        """Main loop: pull tasks from the worker's queue and execute them.

        For each task the worker's ``execute`` generator is consumed:
        ``Item`` results go through the worker's pipeline (a non-``None``
        pipeline result is merged into ``task.meta``); a ``None`` result
        acknowledges the task and ends the iteration.

        Control-flow exceptions:
            ExitWithoutDone: task is put back in the queue and acked.
            ExitWithDone: task is acked.
            ExitWithDoneNoAck: task is left un-acked.
            KeyboardInterrupt: the engine is stopped and the loop returns.
            socket.timeout: database and RabbitMQ connections are reopened.
        """
        logger.info("engine start running")
        self.scheduler.queue_declare(self.worker.queue)
        while True:
            try:
                task = self.scheduler.next_task(self.worker.queue)
                self.current_task = task
                self.worker.execute_before(task)
                response_yield = self.worker.execute(task)
                for response in response_yield:
                    assert isinstance(response, Item) or response is None

                    if isinstance(response, Item):
                        ret = self.worker.pipeline.process_item(response)
                        if ret is not None:
                            task.meta.update(ret)
                    else:
                        # ``None`` marks the end of the task.
                        task.ack()
                        break

                self.worker.execute_after(task)

            except ExitWithoutDone:
                logger.info("exit without done")
                task.in_queue()
                task.ack()
                self.worker.execute_after(task)

            except ExitWithDone:
                logger.info("task finished")
                task.ack()
                self.worker.execute_after(task)

            except ExitWithDoneNoAck:
                logger.info("exit without done, no ack")
                self.worker.execute_after(task)

            except KeyboardInterrupt as error:
                self.worker.handle_exception(error)
                self.stop()
                return

            except socket.timeout as error:
                self.worker.handle_exception(error)
                logger.exception("socket timeout")
                logger.info("try to reconnect database and rabbitmq")
                reconnect_database()
                # Reconnect this engine's own scheduler rather than going
                # through a module-level ``engine`` name.
                self.scheduler.reconnect()
예제 #9
0
 def execute(self, task):
     """Load the task's page (if it has a URL) and run ``get_data``.

     ``self.soup`` is the parsed page, or ``''`` when the task has no URL.
     This is a generator that yields a single ``None`` when done.

     Raises:
         ExitWithoutDone: if the response status is not 200.
     """
     self.task = task
     self.session = create_session(self)
     url = self.task.url
     if not url:
         self.soup = ''
     else:
         logger.info(url)
         response = self.session.get(url)
         if response.status_code != 200:
             raise ExitWithoutDone
         self.soup = BeautifulSoup(response.content)
     self.get_data()
     yield None
예제 #10
0
def adsl_model():
    """Redial the router's PPP link over telnet and log the IP change.

    Sends ``adsl_signal``, logs the current IP, logs in to the router,
    kills ``pppd``, starts it again with the WAN options file, waits, and
    logs the IP again.
    """
    adsl_signal.send(None)
    logger.info("ip: %s" % get_real_ip())
    HOST = "192.168.2.1"
    # NOTE(review): the credentials appear as redacted placeholders here.
    user = '******'
    password = '******'
    tn = telnetlib.Telnet(HOST)
    try:
        # NOTE(review): the login handshake was garbled in the listing;
        # reconstructed as the usual read-prompt / write-credential pairs
        # -- confirm against the real file.
        tn.read_until("login: ")
        tn.write(user + "\n")
        tn.read_until("Password: ")
        tn.write(password + "\n")

        kill_pppd = "killall pppd" + "\n"
        tn.write(kill_pppd)
        # Pause 5s so the redial is not too fast and handed the same IP.
        time.sleep(5)
        pppoe = "pppd file /tmp/ppp/options.wan0" + "\n"
        tn.write(pppoe)
        tn.write("exit\n")

        time.sleep(10)  # wait 10s for the dial-up to complete
        logger.info("after pptp, ip:%s " % get_real_ip())
    finally:
        # Always release the telnet socket, even when a step fails.
        tn.close()
예제 #11
0
 def get_data(self):
     """Build one lawyer record and upsert it into ``mongo_db.sh_lawyer``.

     Name, office, office URL, expertise tags and the base64 encoded
     picture come from the ``dl.user-info`` element; every ``<li>`` of
     ``div#detail01`` is split on its first colon and stored under the key
     that ``self.map_info`` gives for the label.
     """
     user_info = self.soup.find('dl', 'user-info')
     user_info_extra = self.soup.find('div', id='detail01')

     data = {'name': user_info.find('dd', 'name').text.strip()}

     office_link = user_info.find('a')
     data['office'] = office_link.text.strip()
     data['office_url'] = office_link.get('href')

     tag_spans = user_info.find('dd', 'tag').find_all('span')
     expert = ','.join([span.text.strip() for span in tag_spans])
     # a join of ',,' (three empty tags) is stored as an empty string
     data['expert'] = '' if expert == ',,' else expert

     img_url = user_info.find('img').get('src')
     data['img'] = base64.b64encode(self.session.get(img_url).content)

     for li in user_info_extra.find_all('li'):
         key, value = li.text.split(u':', 1)
         data[self.map_info[key]] = value

     logger.info('Licence:%s' % data['licence_code'])
     mongo_db.sh_lawyer.update_one(
         {'licence_code': data['licence_code']},
         {'$set': data},
         upsert=True)
예제 #12
0
 def reconnect(self):
     """Trigger (or join) a PPTP redial and wait for the VPN IP to change.

     ``self.is_pptp`` is read once so that exactly one of the two branches
     runs even if its value changes in between: either a redial is already
     in progress, or the ``"pptp"`` key is set in redis to request one.
     Afterwards the method waits via ``wait_pptp_complete``, logs the IP
     change and calls ``update_vpn_ip``.
     """
     if self.is_pptp:
         # log text: "detected that a dial-up is in progress"
         logger.info(u"检测到正在拨号")
     else:
         # log text: "sending the dial-up signal"
         logger.info(u"发送拨号信号")
         self.redis.set("pptp", 1)
     self.wait_pptp_complete(self.last_ip)
     logger.info("vpn ip change from %s: to %s" %
                 (self.last_ip, self.now_pptp_ip))
     self.update_vpn_ip()
예제 #13
0
def pptp():
    """Run ``pptp.sh`` through bash, logging the real IP before and after."""
    ip_before = get_real_ip()
    logger.info("ip: %s" % ip_before)
    os.system("bash pptp.sh")
    ip_after = get_real_ip()
    logger.info("after pptp, ip:%s " % ip_after)
예제 #14
0
 def send_list_to_queue(self, queue, body):
     """Log the queue/body pair and hand it to ``task.back_to_queue``."""
     message = '%s: %s' % (queue, body)
     logger.info(message)
     self.task.back_to_queue(queue=queue, body=body)
예제 #15
0
파일: engine.py 프로젝트: pijiupapa/crawler
 def stop(self):
     """Log the shutdown and emit the ``engine_stop`` signal for this engine."""
     message = "engine is stopping"
     logger.info(message)
     engine_stop.send(self)
예제 #16
0
파일: main.py 프로젝트: pijiupapa/crawler
def close_database(sender):
    """Close the MongoDB client; ``sender`` is accepted but not used."""
    message = "close database"
    logger.info(message)
    mongo_db.client.close()
예제 #17
0
파일: engine.py 프로젝트: pijiupapa/crawler
class Engine(object):
    """Pulls tasks from the scheduler and runs them on the active worker."""

    def start(self, settings):
        """Store ``settings``, build the scheduler and load worker classes.

        Emits the ``engine_setup`` signal once everything is in place.
        """
        self.settings = settings
        self.scheduler = Scheduler.from_settings(settings)
        self.workers_cls = self.load_all_wokers()
        self.current_task = None
        # self.setup_log()
        engine_setup.send(self)

    def setup_log(self):
        """Point the ``info_file`` log handler at a per-worker log file."""
        # NOTE(review): ``self.worker_cls`` is never assigned in this class
        # (only ``workers_cls`` and ``worker`` are); this method is not
        # called at the moment -- confirm the intended attribute before
        # re-enabling it.
        LOGGING['handlers']['info_file'][
            'filename'] = "log/%s.log" % self.worker_cls.name
        logging.config.dictConfig(LOGGING)

    def load_all_wokers(self):
        """Return what ``get_class_from_module`` finds for ``Worker`` in
        ``settings.workers_dir``."""
        return get_class_from_module(self.settings.workers_dir, Worker)

    def create_worker(self, name):
        """Instantiate the worker registered under ``name`` as ``self.worker``.

        Raises:
            Exception: if no worker class is registered under ``name``.
        """
        if name not in self.workers_cls:
            raise Exception(u"no worker with name: %s" % name)
        worker_cls = self.workers_cls[name]
        self.worker = worker_cls(self)

    def run(self):
        """Main loop: pull tasks from the worker's queue and execute them.

        For each task the worker's ``execute`` generator is consumed:
        ``Item`` results go through the worker's pipeline (a non-``None``
        pipeline result is merged into ``task.meta``); a ``None`` result
        acknowledges the task and ends the iteration.

        Control-flow exceptions:
            ExitWithoutDone: task is put back in the queue and acked.
            ExitWithDone: task is acked.
            ExitWithDoneNoAck: task is left un-acked.
            KeyboardInterrupt: the engine is stopped and the loop returns.
            socket.timeout: database and RabbitMQ connections are reopened.
            Exception: offered to ``worker.handle_exception``; when that
                returns a falsy value the error is reported, the engine
                is stopped and the exception is re-raised.
        """
        logger.info("engine start running")
        self.scheduler.queue_declare(self.worker.queue)
        while True:
            try:
                task = self.scheduler.next_task(self.worker.queue)
                self.current_task = task
                self.worker.execute_before(task)
                response_yield = self.worker.execute(task)
                for response in response_yield:
                    assert isinstance(response, Item) or response is None

                    if isinstance(response, Item):
                        ret = self.worker.pipeline.process_item(response)
                        if ret is not None:
                            task.meta.update(ret)
                    else:
                        # ``None`` marks the end of the task.
                        task.ack()
                        break

                self.worker.execute_after(task)

            except ExitWithoutDone:
                logger.info("exit without done")
                task.in_queue()
                task.ack()
                self.worker.execute_after(task)

            except ExitWithDone:
                logger.info("task finished")
                task.ack()
                self.worker.execute_after(task)

            except ExitWithDoneNoAck:
                logger.info("exit without done, no ack")
                self.worker.execute_after(task)

            except KeyboardInterrupt as error:
                self.worker.handle_exception(error)
                self.stop()
                return

            except socket.timeout as error:
                self.worker.handle_exception(error)
                logger.exception("socket timeout")
                logger.info("try to reconnect database and rabbitmq")
                reconnect_database()
                # Reconnect this engine's own scheduler rather than going
                # through a module-level ``engine`` name.
                self.scheduler.reconnect()

            except Exception as error:
                ret = self.worker.handle_exception(error)
                if not ret:
                    # The worker did not handle it: report, stop, propagate.
                    client.captureException()
                    logger.info("worker not handler")
                    logger.exception("uncaught exception")
                    self.stop()
                    raise
예제 #18
0
파일: client.py 프로젝트: pijiupapa/crawler
 def ready_for_ppoe(self):
     """Send the ready signal, wait for the redial to finish, then reset it."""
     self.send_ready_signal()
     waiting_message = "wait ppoe complete"
     logger.info(waiting_message)
     self.wait_ppoe_complete()
     self.reset_ready_signal()
예제 #19
0
파일: main.py 프로젝트: pijiupapa/crawler
def close_rabbitmq(sender):
    """Close the engine's scheduler; ``sender`` is accepted but not used."""
    message = "close rabbbitmq"
    logger.info(message)
    engine.scheduler.close()
예제 #20
0
파일: client.py 프로젝트: pijiupapa/crawler
 def request_ppoe(self):
     """Log the request and set ``REQUEST_PPOE_SIGNAL`` to 1 in redis."""
     message = "request adsl"
     logger.info(message)
     self.redis.set(REQUEST_PPOE_SIGNAL, 1)
예제 #21
0
파일: client.py 프로젝트: pijiupapa/crawler
 def send_ready_signal(self):
     """Store ``READY`` for this process id in the ``REGISTER_PIDS`` hash."""
     logger.info("send ready signal")
     pid = self.get_pid()
     self.redis.hset(REGISTER_PIDS, pid, READY)
예제 #22
0
 def reconnect(self):
     """Rebuild the connection via ``init_connection`` and log its open state."""
     self.init_connection()
     is_open = self.connection.is_open
     logger.info("rabbitmq connection open state: %s" % is_open)
예제 #23
0
 def check_notify_ppoe_signal(self):
     """Reconnect and abort the task when a redial has been announced.

     Does nothing while ``ppoe_client.get_notify_ppoe_signal()`` is falsy.

     Raises:
         ExitWithDoneNoAck: after ``adsl_reconnect`` has been called.
     """
     if not ppoe_client.get_notify_ppoe_signal():
         return
     logger.info("check ppoe will reconnect")
     self.adsl_reconnect()
     raise ExitWithDoneNoAck()
예제 #24
0
 def start(self):
     """Log the current VPN IP and call ``update_vpn_ip``."""
     current_ip = self.now_pptp_ip
     logger.info("vpn ip: %s" % current_ip)
     self.update_vpn_ip()
예제 #25
0
파일: main.py 프로젝트: pijiupapa/crawler
def open_rabbitmq(sender):
    """Reconnect the engine's scheduler; ``sender`` is accepted but not used."""
    message = "rabbitmq connect"
    logger.info(message)
    engine.scheduler.reconnect()
예제 #26
0
 def close(self):
     """Log the shutdown and close the connection.

     The channel is not closed explicitly here.
     """
     # log text: "closing rabbitmq"
     message = u"正在关闭rabbitmq"
     logger.info(message)
     self.connection.close()
예제 #27
0
 def on_ban(self, response):
     """Handle a ban: log it, reconnect, then abort the task.

     ``response`` is accepted but not used.

     Raises:
         ExitWithDoneNoAck: always, after ``adsl_reconnect`` has run.
     """
     message = "ip has been banned"
     logger.info(message)
     self.adsl_reconnect()
     raise ExitWithDoneNoAck()