def _inner():
    if initial_delay:
        greenthread.sleep(initial_delay)

    try:
        while self._running:
            idle = self.f(*self.args, **self.kw)
            if not self._running:
                break

            if periodic_interval_max is not None:
                idle = min(idle, periodic_interval_max)
            LOG.debug(_('Dynamic looping call sleeping for %.02f seconds at %s Name: %s'
                        % (idle,
                           inspectutils.fuction_full_name(self.f),
                           inspectutils.fuction_class(self.f).m_name)))
            greenthread.sleep(idle)
    except LoopingCallDone as e:
        self.stop()
        done.send(e.retvalue)
    except Exception:
        LOG.exception(_('in dynamic looping call'))
        done.send_exception(*sys.exc_info())
        return
    else:
        done.send(True)

def _inner():
    if initial_delay:
        greenthread.sleep(initial_delay)

    try:
        while self._running:
            start = timeutils.utcnow()
            self.f(*self.args, **self.kw)
            end = timeutils.utcnow()
            if not self._running:
                break
            delay = interval - timeutils.delta_seconds(start, end)
            if delay <= 0:
                LOG.warn(_('task run outlasted interval by %s sec') % -delay)
            greenthread.sleep(delay if delay > 0 else 0)
    except LoopingCallDone as e:
        self.stop()
        done.send(e.retvalue)
    except Exception:
        LOG.exception(_('in fixed duration looping call'))
        done.send_exception(*sys.exc_info())
        return
    else:
        done.send(True)

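# NOTE: a minimal usage sketch, not part of the original module.  The two
# _inner closures above follow the usual oslo-style looping-call pattern:
# the wrapped function raises LoopingCallDone(retvalue) to stop the loop
# (delivered through done.send), and for the dynamic variant its return
# value is the number of seconds to idle before the next run.  The
# DynamicLoopingCall wrapper name and start() signature are assumed from
# that pattern.
#
#     def poll_once():
#         remaining = check_pending_work()      # hypothetical helper
#         if not remaining:
#             raise LoopingCallDone(retvalue=True)
#         return 5.0                            # idle 5 seconds, then rerun
#
#     timer = DynamicLoopingCall(poll_once)
#     timer.start(initial_delay=None, periodic_interval_max=60)
#     timer.wait()
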
def _start_child(self, wrap):
    if len(wrap.forktimes) > wrap.workers:
        # Limit ourselves to one process a second (over the period of
        # number of workers * 1 second). This will allow workers to
        # start up quickly but ensure we don't fork off children that
        # die instantly too quickly.
        if time.time() - wrap.forktimes[0] < wrap.workers:
            LOG.info(_('Forking too fast, sleeping'))
            time.sleep(1)

        wrap.forktimes.pop(0)

    wrap.forktimes.append(time.time())

    pid = os.fork()
    if pid == 0:
        launcher = self._child_process(wrap.service)
        while True:
            self._child_process_handle_signal()
            status, signo = self._child_wait_for_exit_or_signal(launcher)
            if not _is_sighup(signo):
                break
            launcher.restart()

        os._exit(status)

    LOG.info(_('Started child %d'), pid)

    wrap.children.add(pid)
    self.children[pid] = wrap

    return pid

def readyForFetch(self, nhost):
    '''param nhost: normalized hostname (when the crawldoc has a fake_host,
    the fake_host is passed in instead).
    Returns the number of seconds the fetcher manager should sleep before
    fetching from this host.'''
    delay = 0
    if nhost in self.hostload_delay.keys():
        if nhost in self.hostload_exception.keys():
            interval = self.hostload_exception[nhost]
        else:
            interval = self.default_hostload
        LOG.debug(_("Get hostload interval host: %(host)s, interval:%(interval)s"),
                  {'host': nhost, 'interval': interval})
        now = int(timeutils.utcnow_ts())
        if now - self.hostload_delay[nhost] < interval:
            delay = interval + self.hostload_delay[nhost] - now
            LOG.debug(_("Interval not arrive for host: %(host)s,"
                        " now:%(now)s delay:%(delay)s last delay:%(last)s"),
                      {'host': nhost, 'now': now, 'delay': delay,
                       'last': self.hostload_delay[nhost]})
            self.hostload_delay[nhost] = int(timeutils.utcnow_ts())
    else:
        self.hostload_delay[nhost] = int(timeutils.utcnow_ts())
        LOG.debug(_("Add host: %(host)s, delay:%(delay)s"),
                  {'host': nhost, 'delay': self.hostload_delay[nhost]})
    if delay < 0:
        delay = 0
    return delay

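# Worked example with assumed values (not part of the original module):
# with default_hostload = 10 seconds and a host whose last recorded fetch
# was 4 seconds ago, readyForFetch() returns 10 + (now - 4) - now == 6,
# so the fetcher sleeps 6 seconds before hitting that host again; a host
# seen for the first time is only recorded and gets a delay of 0.
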
def _wait_child(self):
    try:
        # Don't block if no child processes have exited
        pid, status = os.waitpid(0, os.WNOHANG)
        if not pid:
            return None
    except OSError as exc:
        if exc.errno not in (errno.EINTR, errno.ECHILD):
            raise
        return None

    if os.WIFSIGNALED(status):
        sig = os.WTERMSIG(status)
        LOG.info(_('Child %(pid)d killed by signal %(sig)d'),
                 dict(pid=pid, sig=sig))
    else:
        code = os.WEXITSTATUS(status)
        LOG.info(_('Child %(pid)s exited with status %(code)d'),
                 dict(pid=pid, code=code))

    if pid not in self.children:
        LOG.warning(_('pid %d not in child list'), pid)
        return None

    wrap = self.children.pop(pid)
    wrap.children.remove(pid)
    return wrap

def run_periodic_report_tasks(self, service):
    '''TODO: Read from database'''
    # get fresh crawldoc
    self.wait_for_outputqueue_ready()
    docs = db_api.getFreshCrawlDoc(self.read_batch_num, self.max_level)
    if len(docs) < self.read_batch_num * 0.5:
        # get fresh timeout crawldoc
        timeout_docs = db_api.getTimeoutCrawlDoc(
            self.crawl_timeout, self.max_timeout_retry_time,
            self.read_batch_num)
        docs = docs + timeout_docs
    if len(docs) < self.read_batch_num * 0.5:
        # get crawl fail crawldoc
        fail_docs = db_api.getFailCrawlDoc(self.max_fail_retry_time,
                                           self.read_batch_num)
        docs = docs + fail_docs
    if len(docs) < self.read_batch_num * 0.5:
        # get crawl fail timeout crawldoc
        fail_docs = db_api.getTimeoutFailCrawlDoc(
            self.crawl_timeout,
            self.max_fail_retry_time + self.max_timeout_retry_time,
            self.read_batch_num)
        docs = docs + fail_docs

    # check good crawldoc or not
    for doc in docs:
        if not self.checker.checkbefore(doc):
            LOG.error(_('UnHealthy crawldoc %(crawldoc)s'), {'crawldoc': doc})
            continue
        if not self.filter.Legalurl(doc.url):
            LOG.error(_('UnLegalurl crawldoc %(crawldoc)s'), {'crawldoc': doc})
            continue
        # url and docid save at db_api.addPendingCrawlDocDict
        # doc.url = urlutils.normalize(doc.request_url)
        # doc.docid = mmh3.hash(doc.url)
        doc.host = urlutils.gethost(doc.url)
        LOG.debug(_('Export crawldoc %(crawldoc)s'), {'crawldoc': doc})
        self.output(doc)

def wait(self):
    """Loop waiting on children to die and respawning as necessary."""

    LOG.debug(_('Full set of CONF:'))
    CONF.log_opt_values(LOG, std_logging.DEBUG)

    while True:
        self.handle_signal()
        self._respawn_children()
        if self.sigcaught:
            signame = _signo_to_signame(self.sigcaught)
            LOG.info(_('Caught %s, stopping children'), signame)
        if not _is_sighup(self.sigcaught):
            break

        for pid in self.children:
            os.kill(pid, signal.SIGHUP)
        self.running = True
        self.sigcaught = None

    for pid in self.children:
        try:
            os.kill(pid, signal.SIGTERM)
        except OSError as exc:
            if exc.errno != errno.ESRCH:
                raise

    # Wait for children to die
    if self.children:
        LOG.info(_('Waiting on %d children to exit'), len(self.children))
        while self.children:
            self._wait_child()

def ssh_execute(ssh, cmd, process_input=None,
                addl_env=None, check_exit_code=True):
    LOG.debug(_('Running cmd (SSH): %s'), cmd)
    if addl_env:
        raise InvalidArgumentError(_('Environment not supported over SSH'))

    if process_input:
        # This is (probably) fixable if we need it...
        raise InvalidArgumentError(_('process_input not supported over SSH'))

    stdin_stream, stdout_stream, stderr_stream = ssh.exec_command(cmd)
    channel = stdout_stream.channel

    # NOTE(justinsb): This seems suspicious...
    # ...other SSH clients have buffering issues with this approach
    stdout = stdout_stream.read()
    stderr = stderr_stream.read()
    stdin_stream.close()

    exit_status = channel.recv_exit_status()

    # exit_status == -1 if no exit code was returned
    if exit_status != -1:
        LOG.debug(_('Result was %s') % exit_status)
        if check_exit_code and exit_status != 0:
            raise ProcessExecutionError(exit_code=exit_status,
                                        stdout=stdout,
                                        stderr=stderr,
                                        cmd=cmd)

    return (stdout, stderr)

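# A minimal usage sketch (assumed, not part of the original module): the
# `ssh` argument is expected to behave like a connected paramiko SSHClient,
# whose exec_command() returns the (stdin, stdout, stderr) streams used
# above.  Host, user and key path below are illustrative only.
#
#     import paramiko
#
#     client = paramiko.SSHClient()
#     client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
#     client.connect('203.0.113.10', username='crawler',
#                    key_filename='/home/crawler/.ssh/id_rsa')
#     out, err = ssh_execute(client, 'uptime')
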
def _wait_for_exit_or_signal(self):
    status = None
    signo = 0

    LOG.debug(_('Full set of CONF:'))
    CONF.log_opt_values(LOG, std_logging.DEBUG)

    try:
        super(ServiceLauncher, self).wait()
    except SignalExit as exc:
        signame = _signo_to_signame(exc.signo)
        LOG.info(_('Caught %s, exiting'), signame)
        status = exc.code
        signo = exc.signo
    except SystemExit as exc:
        status = exc.code
    finally:
        self.stop()
        # if rpc:
        #     try:
        #         rpc.cleanup()
        #     except Exception:
        #         # We're shutting down, so it doesn't matter at this point.
        #         LOG.exception(_('Exception during rpc cleanup.'))

    return status, signo

def output(self, crawldoc):
    self.wait_for_outputqueue_ready()
    try:
        self._m_output_queue.put(crawldoc)
        LOG.debug(_(" Output One Crawldoc at %(cname)s mid:%(m_id)s, \nCrawlDoc:\n%(doc)s"),
                  {'cname': self.__class__.__name__,
                   'm_id': self.m_id, 'doc': crawldoc})
    except AttributeError:
        LOG.debug(_(" Output queue is None at %(cname)s mid:%(m_id)s"),
                  {'cname': self.__class__.__name__, 'm_id': self.m_id})

def process(self, crawldoc):
    start_time = time.time()
    request = Request()
    request.FillByCrawlDoc(crawldoc)
    LOG.debug(_("Begin Fetch Crawldoc: %(crawldoc)s"), {'crawldoc': crawldoc})
    LOG.debug(_("Begin Fetch Request: %(request)s"), {'request': request})
    response = self.fetch(request)
    LOG.debug(_("Finish Fetch Response: %(response)s"), {'response': response})
    response.FillCrawlDoc(crawldoc)
    LOG.debug(_("Finish Fetch Crawldoc: %(Crawldoc)s using:%(usetime)ss"),
              {'Crawldoc': crawldoc, 'usetime': (time.time() - start_time)})
    self.times.append(("%s %s" % (request.method, request.url),
                       start_time, time.time()))

def ProcessCrawlDoc(self, crawldoc):
    host = crawldoc.host
    if crawldoc.fake_host:
        host = crawldoc.fake_host
    delay = self.hostload.readyForFetch(host)
    LOG.debug(_("Before ProcessCrawldoc sleep %(sleep)s at %(fetch_id)s crawldoc: %(crawldoc)s"),
              {'sleep': delay, 'fetch_id': self.m_id, 'crawldoc': crawldoc})
    greenthread.sleep(delay)
    self.client.process(crawldoc)
    print crawldoc.content
    crawldoc.crawl_time = int(timeutils.utcnow_ts())
    LOG.debug(_("After ProcessCrawldoc at %(fetch_id)s crawldoc: %(crawldoc)s"),
              {'fetch_id': self.m_id, 'crawldoc': crawldoc})

def checkafter(self, crawldoc):
    level, reason = self.getLevelAfter(crawldoc)
    if level > 0:
        LOG.info(_('CrawlDoc Check After Healthy: %(level)s Reason:%(reason)s,CrawlDoc:\n%(doc)s'),
                 {'level': _crawldoc_healthy_level[level],
                  'reason': reason, 'doc': crawldoc})
    else:
        LOG.debug(_('CrawlDoc Check After Healthy: %(level)s Reason:%(reason)s'),
                  {'level': _crawldoc_healthy_level[level],
                   'reason': reason, 'doc': crawldoc})
    if level == 2:
        return False
    return True

def request(self, url, method, **kwargs):
    kwargs.setdefault('headers', kwargs.get('headers', {}))
    kwargs['headers']['User-Agent'] = self.user_agent
    kwargs['headers']['Accept'] = ','.join(self.accept_types)
    if 'body' in kwargs:
        kwargs['headers']['Content-Type'] = 'application/json'
        kwargs['data'] = json.dumps(kwargs['body'])
        del kwargs['body']
    # if self.timeout is not None:
    #     kwargs.setdefault('timeout', self.timeout)
    # kwargs['verify'] = self.verify_cert
    # kwargs['allow_redirects'] = True
    # url = urlutils.normalize_url(url)

    self.http_log_req(method, url, kwargs)
    resp = self.http.request(method=method,
                             url=url,
                             allow_redirects=True,
                             **kwargs)
    self.http_log_resp(resp)
    if resp.encoding == 'none' or resp.encoding == 'ISO-8859-1':
        resp.encoding = urlutils.get_charset_from_metadata(resp.text)
    LOG.debug(_("request get encoding: %(encoding)s reason: %(reason)s history: %(history)s"
                " elapsed: %(elapsed)s cookies: %(cookies)s headers: %(headers)s status_code: %(status_code)s"
                " url: %(url)s"),
              {'encoding': resp.encoding, 'reason': resp.reason,
               'history': resp.history, 'elapsed': resp.elapsed,
               'cookies': resp.cookies, 'headers': resp.headers,
               'url': resp.url, 'status_code': resp.status_code})
    # LOG.debug(_("request get text: %(text)s"), {'text': resp.text})
    return resp

def initialize_if_enabled():
    backdoor_locals = {
        'exit': _dont_use_this,      # So we don't exit the entire process
        'quit': _dont_use_this,      # So we don't exit the entire process
        'fo': _find_objects,
        'pgt': _print_greenthreads,
        'pnt': _print_nativethreads,
    }

    if CONF.backdoor_port is None:
        return None

    start_port, end_port = _parse_port_range(str(CONF.backdoor_port))

    # NOTE(johannes): The standard sys.displayhook will print the value of
    # the last expression and set it to __builtin__._, which overwrites
    # the __builtin__._ that gettext sets. Let's switch to using pprint
    # since it won't interact poorly with gettext, and it's easier to
    # read the output too.
    def displayhook(val):
        if val is not None:
            pprint.pprint(val)
    sys.displayhook = displayhook

    sock = _listen('localhost', start_port, end_port, eventlet.listen)

    # In the case of backdoor port being zero, a port number is assigned by
    # listen().  In any case, pull the port number out here.
    port = sock.getsockname()[1]
    LOG.info(_('Eventlet backdoor listening on %(port)s for process %(pid)d') %
             {'port': port, 'pid': os.getpid()})
    eventlet.spawn_n(eventlet.backdoor.backdoor_server, sock,
                     locals=backdoor_locals)
    return port

def _getPendingCrawldoc(crawlfail=False, delete=False, filters=None,
                        limit=None, sort_key=None, sort_dir='asc'):
    filters = filters or {}

    # FIXME(sirp): now that we have the `disabled` field for instance-types,
    # we should probably remove the use of `deleted` to mark inactive.
    # `deleted` should mean truly deleted, e.g. we can safely purge the
    # record out of the database.
    if crawlfail:
        models_table = models.CrawlFailPending
    else:
        models_table = models.CrawlPending
    query = model_query(models_table)

    if 'crawl_status' in filters and filters['crawl_status'] == 'fresh':
        query = query.filter(or_(models_table.crawl_status == 'fresh',
                                 models_table.crawl_status == ''))
    if 'crawl_status' in filters and filters['crawl_status'] == 'scheduled':
        query = query.filter(models_table.crawl_status == 'scheduled')
    if 'crawl_status' in filters and filters['crawl_status'] == 'crawled':
        query = query.filter(models_table.crawl_status == 'crawled')
    if 'max_level' in filters:
        query = query.filter(models_table.level <= filters['max_level'])
    if 'max_recrawl_time' in filters:
        query = query.filter(
            models_table.recrawl_times <= filters['max_recrawl_time'])
    if 'timeout' in filters:
        query = query.filter(models_table.schedule_time <= filters['timeout'])

    LOG.debug(_("get crawldoc sql %(query)s"), {'query': query})
    return query.all()

def deprecated(self, msg, *args, **kwargs):
    stdmsg = _("Deprecated: %s") % msg
    if CONF.fatal_deprecations:
        self.critical(stdmsg, *args, **kwargs)
        raise DeprecatedConfig(msg=stdmsg)
    else:
        self.warn(stdmsg, *args, **kwargs)

def __init__(self, manager=None, number=0):
    self._input_queue = None
    self._output_queue = None
    self.managers = []
    self.number = number
    LOG.info(_("=====================Start %s number:%s===================="
               % (manager, self.number)))
    i = 0
    while i < self.number:
        _manager = manager()
        self.managers.append(_manager)
        LOG.info(_("Start Manager in Container: %(mname)s id:%(m_id)s"),
                 {'mname': _manager.__class__.__name__,
                  'm_id': _manager.m_id})
        i = i + 1
    LOG.info(_(60 * "="))
    self.m_name = 'NA'
    if len(self.managers) != 0:
        self.m_name = self.managers[0].__class__.__name__

def __init__(self, manager=None):
    super(FetcherManagerContainer, self).__init__(manager)
    self._input_queue = None
    self._output_queue = None
    self.m_name = self.__class__.__name__
    self.fetcher_managers = []
    self.dispatcher = Dispatcher()
    LOG.info(_("=====================Start FetcherManager number:%s===================="
               % CONF.fetcher_number))
    i = 0
    while i < CONF.fetcher_number:
        manager_class_name = importutils.import_class(CONF.fetcher_manager)
        manager_class = manager_class_name()
        self.fetcher_managers.append(manager_class)
        LOG.info(_("Start Manager: %(mname)s id:%(m_id)s"),
                 {'mname': manager_class.__class__.__name__,
                  'm_id': manager_class.m_id})
        i += 1
    LOG.info(_(60 * "="))

def _pipe_watcher(self):
    # This will block until the write end is closed when the parent
    # dies unexpectedly
    self.readpipe.read()

    LOG.info(_('Parent process has died unexpectedly, exiting'))

    sys.exit(1)

def Run(self, context, *args, **kwargs):
    LOG.info(_(" %(cname)s id:%(m_id)s Start to Run, InputQueue %(input_q)s, OutputQueue %(output_q)s"),
             {'cname': self.__class__.__name__, 'm_id': self.m_id,
              'input_q': self._m_input_queue,
              'output_q': self._m_output_queue})
    while True:
        if self._m_input_queue == None:
            LOG.debug(_("Manager %(cname)s inputqueue is None"),
                      {'cname': self.__class__.__name__})
            break
        if self._m_input_queue.empty():
            # LOG.debug(_("Manager %(cname)s mid:%(m_id)s inputqueue is Empty"),
            #           {'cname': self.__class__.__name__, 'm_id': self.m_id})
            greenthread.sleep(3)
            continue
        crawldoc = self._m_input_queue.get()
        LOG.debug(_(" Get Crawldoc at %(cname)s mid:%(m_id)s, \nCrawlDoc:\n%(doc)s"),
                  {'cname': self.__class__.__name__,
                   'm_id': self.m_id, 'doc': crawldoc})
        self.ProcessCrawlDoc(crawldoc)
        self.output(crawldoc)

def periodic_report_tasks(self, service, raise_on_error=False):
    '''Fixed interval task; override run_periodic_report_tasks to change
    what each fetcher does.'''
    try:
        for fetcher in self.fetcher_managers:
            fetcher.run_periodic_report_tasks(service)
    except Exception as e:
        if raise_on_error:
            raise
        LOG.exception(_("Error during %(full_task_name)s: %(e)s"), locals())

def getTimeoutFailCrawlDoc(timeout, max_timeout_time, limit):
    filters = {'crawl_status': 'scheduled',
               'max_recrawl_time': max_timeout_time,
               'timeout': timeout}
    docs = _getPendingCrawldoc(crawlfail=True, filters=filters, limit=limit)
    LOG.debug(_("get Fail Timeout Crawldoc number %(docs)s"),
              {'docs': len(docs)})
    for doc in docs:
        _updateScheduleDoc(doc.id, doc.recrawl_times, 'scheduled',
                           crawlfail=True)
    return [_format_pending_crawldoc(doc) for doc in docs]

def wait_for_inputqueue_ready(self):
    while True:
        if self._m_input_queue == None or \
                self._m_input_queue.qsize() < CONF.max_queue_size:
            break
        else:
            greenthread.sleep(3)
            LOG.debug(_(" Input Queue is Full at %(cname)s mid:%(m_id)s"),
                      {'cname': self.__class__.__name__, 'm_id': self.m_id})

def getFailCrawlDoc(retry_time, limit):
    '''param retry_time: get fail docs from crawl_fail_pending whose
    recrawl_times is less than retry_time
    step 1: get docs from crawl_fail_pending whose status is crawled or NULL
            and recrawl_times < retry_time
    step 2: update recrawl time, crawl status and schedule time'''
    filters = {'crawl_status': 'fresh',
               'max_recrawl_time': retry_time}
    docs = _getPendingCrawldoc(crawlfail=True, filters=filters, limit=limit)
    LOG.debug(_("get Fail Crawldoc number %(docs)s"), {'docs': len(docs)})
    for doc in docs:
        _updateScheduleDoc(doc.id, doc.recrawl_times, 'crawled',
                           crawlfail=True)
    return [_format_pending_crawldoc(doc) for doc in docs]

def getFreshCrawlDoc(limit, level):
    '''param limit: how many crawldocs to fetch
    step1: get up to limit crawldocs from crawl_pending
    step2: mark crawl status from fresh to scheduled, update recrawl times
           and schedule time'''
    filters = {'crawl_status': 'fresh',
               'max_level': level}
    docs = _getPendingCrawldoc(filters=filters, limit=limit, sort_key='level')
    LOG.debug(_("get Fresh Crawldoc number %(docs)s"), {'docs': len(docs)})
    for doc in docs:
        _updateScheduleDoc(doc.id, doc.recrawl_times, 'scheduled',
                           crawlfail=False)
    return [_format_pending_crawldoc(doc) for doc in docs]

def dispatch(self, crawldoc):
    # fill fake host
    try:
        crawldoc.fake_host = self.hostload.fakeHost(crawldoc.host)
        LOG.debug(_("Fill fake host from: %(host)s to %(fakehost)s"),
                  {'host': crawldoc.host, 'fakehost': crawldoc.fake_host})
    except:
        crawldoc.fake_host = None
    disp = getattr(self, 'dispatch_as_%s' % CONF.dispatch_as)
    return disp(crawldoc)

def start(self):
    LOG.info(_("Begin to loop container.pre_start_hook"))
    for container in self.containers:
        LOG.info(_("%(cname)s Pre_start_hook"), {'cname': container.m_name})
        container.pre_start_hook(self.context)
    LOG.info(_("End to loop container.pre_start_hook"))

    for container in self.containers:
        # start container thread Run
        self.tg.add_thread(container.Run, self.context,
                           self.saved_args, self.saved_kwargs)

    LOG.info(_("Begin to loop container.post_start_hook"))
    for container in self.containers:
        LOG.info(_("%(cname)s Post_start_hook"), {'cname': container.m_name})
        container.post_start_hook(self.context)
    LOG.info(_("End to loop container.post_start_hook"))

    if self.periodic_enable:
        if self.periodic_fuzzy_delay:
            initial_delay = random.randint(0, self.periodic_fuzzy_delay)
        else:
            initial_delay = None
        for container in self.containers:
            # start periodic tasks
            self.tg.add_dynamic_timer(
                container.periodic_tasks,
                initial_delay=initial_delay,
                periodic_interval_max=self.periodic_interval_max,
                context=self.context)

    if self.report_interval:
        # start report state periodic tasks
        self.tg.add_timer(self.report_interval,
                          self._report_state,
                          self.report_interval,
                          self)

def getTimeoutCrawlDoc(timeout, max_timeout_time, limit):
    '''param timeout: compared with the schedule time in crawl_pending to
    detect timed-out crawldocs
    param max_timeout_time: compared with recrawl_times to decide whether
    to fetch the doc or not
    step1: get docs whose status is scheduled and which have timed out
    step2: update crawl_pending recrawl_times and schedule_time'''
    filters = {'crawl_status': 'scheduled',
               'max_recrawl_time': max_timeout_time,
               'timeout': timeout}
    docs = _getPendingCrawldoc(filters=filters, limit=limit)
    LOG.debug(_("get Timeout Crawldoc number %(docs)s"), {'docs': len(docs)})
    for doc in docs:
        _updateScheduleDoc(doc.id, doc.recrawl_times, 'scheduled',
                           crawlfail=False)
    return [_format_pending_crawldoc(doc) for doc in docs]

def __init__(self, managers, report_interval=None,
             periodic_enable=None, periodic_fuzzy_delay=None,
             periodic_interval_max=None, *args, **kwargs):
    super(MultiServer, self).__init__()
    self.managers_class_name = managers
    self.containers = []
    LOG.info(_("=====================Start ManagerContainers ===================="))
    for manager_class_name in self.managers_class_name:
        if isinstance(manager_class_name, tuple):
            manager_class = importutils.import_class(manager_class_name[0])
            manager_class_number = manager_class_name[1]
            LOG.info(_("Start Multi %s number %s"
                       % (manager_class, manager_class_number)))
        else:
            manager_class = importutils.import_class(manager_class_name)
            manager_class_number = 1
        if issubclass(manager_class, managercontainer.ManagerContainer):
            manager_or_container = manager_class()
            self.containers.append(manager_or_container)
            LOG.info(_("Start ManagerContainer: %(mname)s"),
                     {'mname': manager_or_container.m_name})
        else:
            container = managercontainer.ManagerContainer(
                manager=manager_class, number=manager_class_number)
            self.containers.append(container)
    LOG.info(_(60 * "="))

    self.report_interval = report_interval
    self.periodic_enable = periodic_enable
    self.periodic_fuzzy_delay = periodic_fuzzy_delay
    self.periodic_interval_max = periodic_interval_max
    self.saved_args, self.saved_kwargs = args, kwargs
    self.context = context.get_service_context()
    self.context.tg = self.tg
    self.context.periodic_enable = self.periodic_enable
    self.context.report_interval = self.report_interval
    self.context.periodic_fuzzy_delay = self.periodic_fuzzy_delay
    self.context.periodic_interval_max = self.periodic_interval_max

    LOG.info(_("========================Managers chain================="))
    idx = 0
    while idx < len(self.containers):
        LOG.info(_("Chain %(index)s Manager: %(mname)s"),
                 {'index': idx, 'mname': self.containers[idx].m_name})
        idx += 1
    LOG.info(_(60 * "="))

    length = len(self.containers)
    if length == 0:
        raise RuntimeError('No manager containers to start')
    i = 1
    while i < length:
        # connect all manager containers, containers hand in hand
        _queue = queue.LightQueue(CONF.max_queue_size)
        self.containers[i - 1].output_queue = _queue
        self.containers[i].input_queue = _queue
        i += 1

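# A hypothetical configuration sketch (module paths below are illustrative,
# not from the original code): MultiServer accepts a list of manager import
# paths, or (path, count) tuples when several copies of a plain manager
# should run in one container, and chains the resulting containers together
# with LightQueues so each container's output feeds the next one's input.
#
#     managers = [
#         'crawler.managers.ReadDBManager',              # produces crawldocs
#         'crawler.managers.FetcherManagerContainer',    # fetches pages
#         ('crawler.managers.HandlerManager', 4),        # 4 parser workers
#     ]
#     server = MultiServer(managers, report_interval=10,
#                          periodic_enable=True, periodic_fuzzy_delay=60,
#                          periodic_interval_max=300)
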
def ProcessCrawlDoc(self, crawldoc):
    LOG.debug(_('Get one crawldoc at %(name_)s %(handler_id)s crawldoc %(crawldoc)s'),
              {'name_': self.__class__.__name__,
               'handler_id': self.m_id, 'crawldoc': crawldoc})
    self.htmlparser.process(crawldoc)
    LOG.debug(_('Finish Process one crawldoc at %(name_)s %(handler_id)s crawldoc %(crawldoc)s'),
              {'name_': self.__class__.__name__,
               'handler_id': self.m_id, 'crawldoc': crawldoc})
    # show some log
    if not self.checker.checkafter(crawldoc):
        return
    start_time = time.time()
    if utils.IsCrawlSuccess(crawldoc):
        db_api.saveSuccessCrawlDoc(crawldoc)
        LOG.debug(_('Finish Save Success Crawldoc to DB use %(cost)s'),
                  {'cost': (time.time() - start_time)})
    else:
        db_api.saveFailCrawlDoc(crawldoc)
        LOG.debug(_('Finish Save Fail Crawldoc to DB use %(cost)s'),
                  {'cost': (time.time() - start_time)})

def loadConfToDict(load_dict, conf_list, conf_file, conf_str=''):
    '''Load "key,value" entries into load_dict, first from conf_list and
    then from conf_file (if it exists); conf_str is only used as a label
    in log messages. Keys already present in load_dict are not overwritten.'''
    LOG.debug(_("%(confstr)s Load from list: %(clist)s, file: %(cfile)s "),
              {'confstr': conf_str, 'clist': conf_list, 'cfile': conf_file})
    for line in conf_list:
        info = line.split(',')
        info[0] = "".join(info[0].split())
        if not info[0] in load_dict.keys():
            load_dict[info[0]] = info[1]
            LOG.debug(_("%(confstr)s Load key: %(info0)s, value:%(info1)s"),
                      {'confstr': conf_str, 'info0': info[0],
                       'info1': info[1]})
    if conf_file and os.path.exists(conf_file):
        with open(conf_file, 'r') as fp:
            lines = fp.readlines()
            for line in lines:
                info = line.split(',')
                info[0] = "".join(info[0].split())
                if not info[0] in load_dict.keys():
                    load_dict[info[0]] = info[1]
                    LOG.debug(_("%(confstr)s Load from file key: %(info0)s, value:%(info1)s"),
                              {'confstr': conf_str, 'info0': info[0],
                               'info1': info[1]})
    else:
        LOG.debug(_("%(confstr)s conf file not exist conf file:%(cfile)s"),
                  {'confstr': conf_str, 'clist': conf_list,
                   'cfile': conf_file})

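# A small usage sketch with assumed values (the host/interval entries and
# the conf file path are illustrative): each entry is a "key,value" line;
# whitespace is stripped from the key only, so values keep their leading
# space.
#
#     hostload_exception = {}
#     loadConfToDict(hostload_exception,
#                    ['www.example.com, 5', 'news.example.org, 30'],
#                    '/etc/crawler/hostload.conf',
#                    conf_str='HostloadException')
#     # -> {'www.example.com': ' 5', 'news.example.org': ' 30'}
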
def Run(self, context, *args, **kwargs):
    '''TODO: save url to pending db'''
    urls = []
    if self.filter.Legalurl(CONF.init_url):
        urls.append(CONF.init_url)
    if CONF.init_url_file != '' and os.path.exists(CONF.init_url_file):
        with open(CONF.init_url_file, 'r') as fp:
            lines = fp.readlines()
            for line in lines:
                if self.filter.Legalurl(line):
                    urls.append(line)
    cl = deweight.get_client()
    for url in urls:
        docid = mmh3.hash(urlutils.normalize(url))
        if not cl.has(docid):
            db_api.addPendingCrawlDoc(url, 0, 0)
            LOG.info(_('Add init url %(url)s'), {'url': url})

def post_start_hook(self, context):
    for manager in self.fetcher_managers:
        LOG.info(_("%(cname)s id:%(m_id)s Post_start_hook"),
                 {'cname': manager.__class__.__name__, 'm_id': manager.m_id})
        manager.post_start_hook()

    # start multi fetchers
    if context.periodic_enable:
        if context.periodic_fuzzy_delay:
            initial_delay = random.randint(0, context.periodic_fuzzy_delay)
        else:
            initial_delay = None
        for fetcher in self.fetcher_managers:
            # start periodic tasks
            context.tg.add_dynamic_timer(
                fetcher.run_periodic_tasks,
                initial_delay=initial_delay,
                periodic_interval_max=context.periodic_interval_max,
                context=context)

def parse(self):
    url_dict = {}
    url_parse = urlparse.urlparse(self.url)
    base_url = urlparse.ParseResult(url_parse.scheme, url_parse.netloc,
                                    '/', None, None, None).geturl()
    base_url_domain = urlutils.getdomain(self.url)
    LOG.info(_("Url: %(url)s,Get BaseUrl: %(baseurl)s and base_domain:%(basedomain)s"),
             {'url': self.url, 'baseurl': base_url,
              'basedomain': base_url_domain})
    soup = BeautifulSoup(self.content)
    for a_tag in soup.findAll('a'):
        if not a_tag.has_key('href'):
            continue
        if a_tag['href'].lower().find('javascript') != -1:
            continue
        if CONF.filter_no_follow and a_tag.has_key('nofollow'):
            continue
        if CONF.filter_onclick and a_tag.has_key('onclick'):
            continue
        new_url = a_tag['href']
        if base_url and not new_url.startswith("http"):
            if new_url.startswith('/'):
                new_url = new_url[1:]
            new_url = base_url + new_url
        ret, reason = self.__filter.filter(new_url)
        if ret:
            LOG.info(_("Filter Url: %(url)s,Reason: %(reason)s"),
                     {'url': new_url, 'reason': reason})
            continue
        if CONF.extract_indomain_link:
            domain = urlutils.getdomain(new_url)
            if not domain.lower() == base_url_domain.lower():
                LOG.info(_("Filter Url: %(url)s,Reason: NotInDomain"),
                         {'url': new_url})
                continue
        if new_url in url_dict.keys():
            if not a_tag.string in url_dict[new_url]:
                url_dict[new_url].append(a_tag.string)
                LOG.debug(_("Add outlink Text Url: %(url)s,value: %(value)s"),
                          {'url': new_url, 'value': url_dict[new_url]})
        else:
            l = list()
            l.append(a_tag.string)
            url_dict[new_url] = l
            LOG.debug(_("Extract Outlink: url: %(url)s,text: %(text)s "),
                      {'url': new_url, 'text': a_tag.string})
    for key, value in url_dict.iteritems():
        ol = OutLink(url=key, text='$@$'.join(value))
        self.outlinks.append(ol)

def run_periodic_report_tasks(self, service):
    LOG.info(_(" Periodic report task at %(cname)s id:%(m_id)s"),
             {'cname': self.__class__.__name__, 'm_id': self.m_id})

def ProcessCrawlDoc(self, crawldoc):
    LOG.debug(_('Get one crawldoc at %(name_)s %(handler_id)s crawldoc %(crawldoc)s'),
              {'name_': self.__class__.__name__,
               'handler_id': self.m_id, 'crawldoc': crawldoc})