Example #1
        def _inner():
            if initial_delay:
                greenthread.sleep(initial_delay)

            try:
                while self._running:
                    idle = self.f(*self.args, **self.kw)
                    if not self._running:
                        break

                    if periodic_interval_max is not None:
                        idle = min(idle, periodic_interval_max)
                    LOG.debug(_('Dynamic looping call sleeping for %.02f '
                                'seconds at %s  Name: %s'),
                              idle, inspectutils.fuction_full_name(self.f),
                              inspectutils.fuction_class(self.f).m_name)
                    greenthread.sleep(idle)
            except LoopingCallDone as e:
                self.stop()
                done.send(e.retvalue)
            except Exception:
                LOG.exception(_('in dynamic looping call'))
                done.send_exception(*sys.exc_info())
                return
            else:
                done.send(True)
Example #2
        def _inner():
            if initial_delay:
                greenthread.sleep(initial_delay)

            try:
                while self._running:
                    start = timeutils.utcnow()
                    self.f(*self.args, **self.kw)
                    end = timeutils.utcnow()
                    if not self._running:
                        break
                    delay = interval - timeutils.delta_seconds(start, end)
                    if delay <= 0:
                        LOG.warn(_('task run outlasted interval by %s sec') %
                                 -delay)
                    greenthread.sleep(delay if delay > 0 else 0)
            except LoopingCallDone as e:
                self.stop()
                done.send(e.retvalue)
            except Exception:
                LOG.exception(_('in fixed duration looping call'))
                done.send_exception(*sys.exc_info())
                return
            else:
                done.send(True)
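The two looping calls above differ only in how the sleep time is chosen: Example #1 sleeps for whatever idle time the callback returns (capped by periodic_interval_max), while Example #2 sleeps for a fixed interval minus the time the callback took. Below is a minimal standalone sketch of both policies, using plain time.sleep instead of greenthread.sleep; the names run_fixed and run_dynamic are illustrative and not part of the original module.

import logging
import time

LOG = logging.getLogger(__name__)


def run_fixed(func, interval, rounds):
    """Call func every `interval` seconds, warning when a run overruns."""
    for _ in range(rounds):
        start = time.time()
        func()
        delay = interval - (time.time() - start)
        if delay <= 0:
            LOG.warning('task run outlasted interval by %s sec', -delay)
        time.sleep(delay if delay > 0 else 0)


def run_dynamic(func, rounds, periodic_interval_max=None):
    """Call func repeatedly, sleeping for the idle time it returns."""
    for _ in range(rounds):
        idle = func()
        if periodic_interval_max is not None:
            idle = min(idle, periodic_interval_max)
        LOG.debug('dynamic loop sleeping for %.02f seconds', idle)
        time.sleep(idle)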
Example #3
    def _start_child(self, wrap):
        if len(wrap.forktimes) > wrap.workers:
            # Limit ourselves to one process a second (over the period of
            # number of workers * 1 second). This will allow workers to
            # start up quickly but ensure we don't fork off children that
            # die instantly too quickly.
            if time.time() - wrap.forktimes[0] < wrap.workers:
                LOG.info(_('Forking too fast, sleeping'))
                time.sleep(1)

            wrap.forktimes.pop(0)

        wrap.forktimes.append(time.time())

        pid = os.fork()
        if pid == 0:
            launcher = self._child_process(wrap.service)
            while True:
                self._child_process_handle_signal()
                status, signo = self._child_wait_for_exit_or_signal(launcher)
                if not _is_sighup(signo):
                    break
                launcher.restart()

            os._exit(status)

        LOG.info(_('Started child %d'), pid)

        wrap.children.add(pid)
        self.children[pid] = wrap

        return pid
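The comment in _start_child describes a simple fork throttle: keep the last `workers` fork timestamps and, if they all fall within the last `workers` seconds, pause before forking again. A standalone sketch of just that check follows; the helper name throttle_forks is illustrative.

import time


def throttle_forks(forktimes, workers):
    """Record a fork timestamp, sleeping 1s when more than `workers`
    forks happened within the last `workers` seconds."""
    if len(forktimes) > workers:
        if time.time() - forktimes[0] < workers:
            time.sleep(1)
        forktimes.pop(0)
    forktimes.append(time.time())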
Example #4
 def readyForFetch(self, nhost):
     '''param nhost: normalized hostname (the fake_host is used instead when present)
         returns the number of seconds the fetcher manager should sleep
         before fetching from this host again'''
     delay = 0
     if nhost in self.hostload_delay.keys():
         if nhost in self.hostload_exception.keys():
             interval = self.hostload_exception[nhost]
         else:
             interval = self.default_hostload
         LOG.debug(_("Get hostload interval host: %(host)s, interval:%(interval)s"),
                     {'host':nhost,'interval':interval})
         now = int(timeutils.utcnow_ts())
         if now - self.hostload_delay[nhost] < interval:
             delay = interval + self.hostload_delay[nhost] - now
             LOG.debug(_("Interval not arrive for host: %(host)s,"
                         " now:%(now)s delay:%(delay)s last delay:%(last)s"),
                         {'host':nhost,'now':now,'delay':delay,
                          'last':self.hostload_delay[nhost]})                       
         self.hostload_delay[nhost] = int(timeutils.utcnow_ts())
     else:
         self.hostload_delay[nhost] = int(timeutils.utcnow_ts())
         LOG.debug(_("Add host: %(host)s, delay:%(delay)s"),
                     {'host':nhost,'delay':self.hostload_delay[nhost]})            
     if delay < 0: delay = 0
     return delay
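readyForFetch boils down to one computation: if the host was last touched less than `interval` seconds ago, wait out the remainder, otherwise fetch immediately. A standalone sketch of that arithmetic; the helper name next_fetch_delay is illustrative.

import time


def next_fetch_delay(last_fetch_ts, interval, now=None):
    """Seconds to wait before hitting the host again."""
    now = int(time.time()) if now is None else now
    if now - last_fetch_ts < interval:
        return max(interval + last_fetch_ts - now, 0)
    return 0

# next_fetch_delay(last_fetch_ts=100, interval=10, now=103) -> 7
# next_fetch_delay(last_fetch_ts=100, interval=10, now=120) -> 0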
Example #5
    def _wait_child(self):
        try:
            # Don't block if no child processes have exited
            pid, status = os.waitpid(0, os.WNOHANG)
            if not pid:
                return None
        except OSError as exc:
            if exc.errno not in (errno.EINTR, errno.ECHILD):
                raise
            return None

        if os.WIFSIGNALED(status):
            sig = os.WTERMSIG(status)
            LOG.info(_('Child %(pid)d killed by signal %(sig)d'),
                     dict(pid=pid, sig=sig))
        else:
            code = os.WEXITSTATUS(status)
            LOG.info(_('Child %(pid)s exited with status %(code)d'),
                     dict(pid=pid, code=code))

        if pid not in self.children:
            LOG.warning(_('pid %d not in child list'), pid)
            return None

        wrap = self.children.pop(pid)
        wrap.children.remove(pid)
        return wrap
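_wait_child reaps at most one exited child per call; the same os.waitpid/WNOHANG pattern can be looped to drain every child that has already exited. A minimal sketch, assuming a POSIX platform; reap_children is an illustrative name, not part of the launcher.

import errno
import os


def reap_children():
    """Return (pid, exit_code) pairs for children that already exited,
    without blocking (a negative code means killed by that signal)."""
    reaped = []
    while True:
        try:
            pid, status = os.waitpid(-1, os.WNOHANG)
        except OSError as exc:
            if exc.errno not in (errno.EINTR, errno.ECHILD):
                raise
            break
        if not pid:
            break
        if os.WIFSIGNALED(status):
            reaped.append((pid, -os.WTERMSIG(status)))
        else:
            reaped.append((pid, os.WEXITSTATUS(status)))
    return reaped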
Example #6
    def run_periodic_report_tasks(self,service):
        '''TODO: Read from database'''
        ''' get fresh crawldoc'''
        self.wait_for_outputqueue_ready()
        docs = db_api.getFreshCrawlDoc(self.read_batch_num, self.max_level)
        if len(docs) < self.read_batch_num * 0.5:
            ''' get fresh timeout crawldoc '''
            timeout_docs = db_api.getTimeoutCrawlDoc(
                                    self.crawl_timeout,self.max_timeout_retry_time,
                                    self.read_batch_num)
            docs = docs + timeout_docs
            if len(docs) < self.read_batch_num * 0.5:
                ''' get crawl fail crawldoc '''
                fail_docs = db_api.getFailCrawlDoc(self.max_fail_retry_time,
                                               self.read_batch_num)
                docs = docs + fail_docs
                if len(docs) < self.read_batch_num * 0.5:
                    ''' get crawl fail timeout crawldoc'''
                    fail_docs = db_api.getTimeoutFailCrawlDoc(self.crawl_timeout,
                                            self.max_fail_retry_time + self.max_timeout_retry_time,
                                            self.read_batch_num)
                    # append the result; previously it was fetched but silently dropped
                    docs = docs + fail_docs
        # check good crawldoc or not
        for doc in docs:
            if not self.checker.checkbefore(doc):
                LOG.error(_('UnHealthy  crawldoc %(crawldoc)s'),{'crawldoc':doc})
                continue
            if not self.filter.Legalurl(doc.url):
                LOG.error(_('UnLegalurl  crawldoc %(crawldoc)s'),{'crawldoc':doc})
                continue
            # url and docid save at db_api.addPendingCrawlDocDict
#            doc.url = urlutils.normalize(doc.request_url)
#            doc.docid = mmh3.hash(doc.url)
            doc.host = urlutils.gethost(doc.url)
            LOG.debug(_('Export crawldoc %(crawldoc)s'),{'crawldoc':doc})
            self.output(doc)
Example #7
    def wait(self):
        """Loop waiting on children to die and respawning as necessary."""

        LOG.debug(_('Full set of CONF:'))
        CONF.log_opt_values(LOG, std_logging.DEBUG)

        while True:
            self.handle_signal()
            self._respawn_children()
            if self.sigcaught:
                signame = _signo_to_signame(self.sigcaught)
                LOG.info(_('Caught %s, stopping children'), signame)
            if not _is_sighup(self.sigcaught):
                break

            for pid in self.children:
                os.kill(pid, signal.SIGHUP)
            self.running = True
            self.sigcaught = None

        for pid in self.children:
            try:
                os.kill(pid, signal.SIGTERM)
            except OSError as exc:
                if exc.errno != errno.ESRCH:
                    raise

        # Wait for children to die
        if self.children:
            LOG.info(_('Waiting on %d children to exit'), len(self.children))
            while self.children:
                self._wait_child()
Example #8
def ssh_execute(ssh, cmd, process_input=None,
                addl_env=None, check_exit_code=True):
    LOG.debug(_('Running cmd (SSH): %s'), cmd)
    if addl_env:
        raise InvalidArgumentError(_('Environment not supported over SSH'))

    if process_input:
        # This is (probably) fixable if we need it...
        raise InvalidArgumentError(_('process_input not supported over SSH'))

    stdin_stream, stdout_stream, stderr_stream = ssh.exec_command(cmd)
    channel = stdout_stream.channel

    # NOTE(justinsb): This seems suspicious...
    # ...other SSH clients have buffering issues with this approach
    stdout = stdout_stream.read()
    stderr = stderr_stream.read()
    stdin_stream.close()

    exit_status = channel.recv_exit_status()

    # exit_status == -1 if no exit code was returned
    if exit_status != -1:
        LOG.debug(_('Result was %s') % exit_status)
        if check_exit_code and exit_status != 0:
            raise ProcessExecutionError(exit_code=exit_status,
                                        stdout=stdout,
                                        stderr=stderr,
                                        cmd=cmd)

    return (stdout, stderr)
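ssh_execute only needs a connected client whose exec_command() returns (stdin, stdout, stderr) streams, which matches paramiko's SSHClient. A possible call site is sketched below, assuming paramiko is available; the host, user and password arguments are placeholders.

import paramiko


def run_remote_uptime(host, user, password):
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(host, username=user, password=password)
    try:
        # ssh_execute raises ProcessExecutionError on a non-zero exit status
        stdout, stderr = ssh_execute(client, 'uptime')
        return stdout
    finally:
        client.close()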
Example #9
    def _wait_for_exit_or_signal(self):
        status = None
        signo = 0

        LOG.debug(_('Full set of CONF:'))
        CONF.log_opt_values(LOG, std_logging.DEBUG)

        try:
            super(ServiceLauncher, self).wait()
        except SignalExit as exc:
            signame = _signo_to_signame(exc.signo)
            LOG.info(_('Caught %s, exiting'), signame)
            status = exc.code
            signo = exc.signo
        except SystemExit as exc:
            status = exc.code
        finally:
            self.stop()
#             if rpc:
#                 try:
#                     rpc.cleanup()
#                 except Exception:
#                     # We're shutting down, so it doesn't matter at this point.
#                     LOG.exception(_('Exception during rpc cleanup.'))

        return status, signo
Example #10
 def output(self,crawldoc):
     self.wait_for_outputqueue_ready()
     try:
         self._m_output_queue.put(crawldoc)
         LOG.debug(_(" Output One Crawldoc at %(cname)s mid:%(m_id)s, \nCrawlDoc:\n%(doc)s"),
                     {'cname':self.__class__.__name__, 'm_id':self.m_id,'doc':crawldoc})
     except AttributeError:
         LOG.debug(_(" Output queue is None at %(cname)s mid:%(m_id)s"),{'cname':self.__class__.__name__, 'm_id':self.m_id})
Example #11
 def process(self,crawldoc):
     start_time = time.time()
     request = Request()
     request.FillByCrawlDoc(crawldoc)
     LOG.debug(_("Begin Fetch Crawldoc: %(crawldoc)s"),{'crawldoc':crawldoc})
     LOG.debug(_("Begin Fetch Request: %(request)s"),{'request':request})
     response = self.fetch(request)
     LOG.debug(_("Finish Fetch Response: %(response)s"),{'response':response})
     response.FillCrawlDoc(crawldoc)
     LOG.debug(_("Finish Fetch Crawldoc: %(Crawldoc)s using:%(usetime)ss"),
                 {'Crawldoc':crawldoc,'usetime':(time.time()-start_time)})
     self.times.append(("%s %s" % (request.method, request.url),
                        start_time, time.time()))
Example #12
 def ProcessCrawlDoc(self, crawldoc):
     host = crawldoc.host
     if crawldoc.fake_host: host = crawldoc.fake_host
     delay = self.hostload.readyForFetch(host)
     LOG.debug(_("Before ProcessCrawldoc sleep %(sleep)s at %(fetch_id)s  crawldoc: %(crawldoc)s"),
               {'sleep':delay,
                'fetch_id':self.m_id,
                'crawldoc':crawldoc})
     greenthread.sleep(delay)
     self.client.process(crawldoc)
     print crawldoc.content
     crawldoc.crawl_time = int(timeutils.utcnow_ts())
     LOG.debug(_("Finish ProcessCrawldoc at %(fetch_id)s  crawldoc: %(crawldoc)s"),
               {'fetch_id':self.m_id,
                'crawldoc':crawldoc})
Example #13
 def checkafter(self,crawldoc):
     level,reason = self.getLevelAfter(crawldoc)
     if level > 0:
         LOG.info(_('CrawlDoc Check After Healthy: %(level)s Reason:%(reason)s,CrawlDoc:\n%(doc)s'),
                  {'level':_crawldoc_healthy_level[level],
                   'reason':reason,
                   'doc':crawldoc})
     else:
         LOG.debug(_('CrawlDoc Check After Healthy: %(level)s Reason:%(reason)s'),
                  {'level':_crawldoc_healthy_level[level],
                   'reason':reason,
                   'doc':crawldoc})
     if level == 2:
         return False
     return True
Example #14
    def request(self, url, method, **kwargs):
        kwargs.setdefault('headers', kwargs.get('headers', {}))
        kwargs['headers']['User-Agent'] = self.user_agent
        kwargs['headers']['Accept'] = ','.join(self.accept_types)
        if 'body' in kwargs:
            kwargs['headers']['Content-Type'] = 'application/json'
            kwargs['data'] = json.dumps(kwargs['body'])
            del kwargs['body']
#        if self.timeout is not None:
#            kwargs.setdefault('timeout', self.timeout)
#        kwargs['verify'] = self.verify_cert
#         kwargs['allow_redirects'] = True
#        url = urlutils.normalize_url(url)
        self.http_log_req(method, url, kwargs)
        resp = self.http.request(method=method,
                                 url=url,
                                 allow_redirects=True,
                                 **kwargs)
        self.http_log_resp(resp)
        if resp.encoding == 'none' or resp.encoding == 'ISO-8859-1':
            resp.encoding = urlutils.get_charset_from_metadata(resp.text)
        LOG.debug(_("request get encoding: %(encoding)s reason: %(reason)s history: %(history)s"
                    " elapsed: %(elapsed)s cookies: %(cookies)s headers: %(headers)s status_code: %(status_code)s"
                    " url: %(url)s"),
                      {'encoding':resp.encoding,
                       'reason':resp.reason,
                       'history':resp.history,
                       'elapsed':resp.elapsed,
                       'cookies':resp.cookies,
                       'headers':resp.headers,
                       'url':resp.url,
                       'status_code':resp.status_code})
#        LOG.debug(_("request get text: %(text)s"),{'text':resp.text})
        return resp
Example #15
def initialize_if_enabled():
    backdoor_locals = {
        'exit': _dont_use_this,      # So we don't exit the entire process
        'quit': _dont_use_this,      # So we don't exit the entire process
        'fo': _find_objects,
        'pgt': _print_greenthreads,
        'pnt': _print_nativethreads,
    }

    if CONF.backdoor_port is None:
        return None

    start_port, end_port = _parse_port_range(str(CONF.backdoor_port))

    # NOTE(johannes): The standard sys.displayhook will print the value of
    # the last expression and set it to __builtin__._, which overwrites
    # the __builtin__._ that gettext sets. Let's switch to using pprint
    # since it won't interact poorly with gettext, and it's easier to
    # read the output too.
    def displayhook(val):
        if val is not None:
            pprint.pprint(val)
    sys.displayhook = displayhook

    sock = _listen('localhost', start_port, end_port, eventlet.listen)

    # In the case of backdoor port being zero, a port number is assigned by
    # listen().  In any case, pull the port number out here.
    port = sock.getsockname()[1]
    LOG.info(_('Eventlet backdoor listening on %(port)s for process %(pid)d') %
             {'port': port, 'pid': os.getpid()})
    eventlet.spawn_n(eventlet.backdoor.backdoor_server, sock,
                     locals=backdoor_locals)
    return port
Example #16
def _getPendingCrawldoc(crawlfail = False, delete = False, filters=None,
                       limit=None, sort_key=None, sort_dir='asc'):
    filters = filters or {}

    # FIXME(sirp): now that we have the `disabled` field for instance-types, we
    # should probably remove the use of `deleted` to mark inactive. `deleted`
    # should mean truly deleted, e.g. we can safely purge the record out of the
    # database.

    if crawlfail:
        models_table = models.CrawlFailPending
    else:
        models_table = models.CrawlPending
    query = model_query(models_table)
    if 'crawl_status' in filters and filters['crawl_status'] == 'fresh':
        query = query.filter(or_(models_table.crawl_status == 'fresh',
                                 models_table.crawl_status == ''))
    if 'crawl_status' in filters and filters['crawl_status'] == 'scheduled':
        query = query.filter(models_table.crawl_status == 'scheduled')
    if 'crawl_status' in filters and filters['crawl_status'] == 'crawled':
        query = query.filter(models_table.crawl_status == 'crawled')

    if 'max_level' in filters:
        query = query.filter(models_table.level <= filters['max_level'])

    if 'max_recrawl_time' in filters:
        query = query.filter(
                models_table.recrawl_times <= filters['max_recrawl_time'])    
    if 'timeout' in filters:
        query = query.filter(
                models_table.schedule_time <= filters['timeout'])
    LOG.debug(_("get crawldoc sql %(query)s"),{'query':query})
    
    return query.all()
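The getFresh/getTimeout/getFail helpers elsewhere on this page build their queries through _getPendingCrawldoc; the filters keys map directly onto the branches above. A hypothetical direct call, for illustration only:

# fresh docs no deeper than level 2 from the regular pending table
docs = _getPendingCrawldoc(crawlfail=False,
                           filters={'crawl_status': 'fresh', 'max_level': 2},
                           sort_key='level')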
Example #17
 def deprecated(self, msg, *args, **kwargs):
     stdmsg = _("Deprecated: %s") % msg
     if CONF.fatal_deprecations:
         self.critical(stdmsg, *args, **kwargs)
         raise DeprecatedConfig(msg=stdmsg)
     else:
         self.warn(stdmsg, *args, **kwargs)
Example #18
 def __init__(self, manager = None, number = 0):
     self._input_queue = None
     self._output_queue = None
     self.managers = []
     self.number = number
     LOG.info(_("=====================Start %s number:%s===================="% (manager,self.number)))
     i = 0
     while i < self.number:
         _manager = manager()
         self.managers.append(_manager)
         LOG.info(_("Start Manager in Container: %(mname)s id:%(m_id)s"),
                     {'mname':_manager.__class__.__name__,'m_id':_manager.m_id})
         i = i + 1
     LOG.info(_(60*"="))
     self.m_name = 'NA'
     if len(self.managers) != 0:
         self.m_name = self.managers[0].__class__.__name__
Example #19
 def __init__(self, manager = None):
     super(FetcherManagerContainer, self).__init__(manager)
     self._input_queue = None
     self._output_queue = None
     self.m_name = self.__class__.__name__
     self.fetcher_managers = []
     self.dispatcher = Dispatcher()
     LOG.info(_("=====================Start FetcherManager  number:%s===================="% CONF.fetcher_number))
     i = 0
     while i < CONF.fetcher_number:
         manager_class_name = importutils.import_class(CONF.fetcher_manager)
         manager_class = manager_class_name()
         self.fetcher_managers.append(manager_class)
         LOG.info(_("Start Manager: %(mname)s id:%(m_id)s"),
                     {'mname':manager_class.__class__.__name__,'m_id':manager_class.m_id})
         i += 1
     LOG.info(_(60*"="))
Example #20
    def _pipe_watcher(self):
        # This will block until the write end is closed when the parent
        # dies unexpectedly
        self.readpipe.read()

        LOG.info(_('Parent process has died unexpectedly, exiting'))

        sys.exit(1)
Example #21
    def Run(self, context, *args, **kwargs):
        LOG.info(_(" %(cname)s id:%(m_id)s Start to Run, InputQueue %(input_q)s, OutputQueue %(output_q)s"),
                    {'cname':self.__class__.__name__, 'm_id':self.m_id,
                    'input_q':self._m_input_queue,'output_q':self._m_output_queue})
        while True:
            if self._m_input_queue == None:
                LOG.debug(_("Manager %(cname)s inputqueue is None"),{'cname':self.__class__.__name__})
                break
            if self._m_input_queue.empty():
#                LOG.debug(_("Manager %(cname)s mid:%(m_id)s inputqueue is Empty"),
#                            {'cname':self.__class__.__name__, 'm_id':self.m_id})
                greenthread.sleep(3)
                continue
            crawldoc = self._m_input_queue.get()
            LOG.debug(_(" Get Crawldoc at %(cname)s mid:%(m_id)s, \nCrawlDoc:\n%(doc)s"),
                        {'cname':self.__class__.__name__, 'm_id':self.m_id,'doc':crawldoc})
            self.ProcessCrawlDoc(crawldoc)
            self.output(crawldoc)
Example #22
 def periodic_report_tasks(self, service, raise_on_error=False):
     '''Fixed-interval task; override run_periodic_report_tasks to customize.'''
     try:
         for fetcher in self.fetcher_managers:
             fetcher.run_periodic_report_tasks(service)
     except Exception as e:
         if raise_on_error:
             raise
         LOG.exception(_("Error during %(full_task_name)s: %(e)s"), locals())
Example #23
def getTimeoutFailCrawlDoc(timeout, max_timeout_time,limit):
    filters = {'crawl_status':'scheduled',
               'max_recrawl_time':max_timeout_time,
               'timeout':timeout}
    docs = _getPendingCrawldoc(crawlfail = True, filters = filters, limit = limit)
    LOG.debug(_("get Fail Timeout Crawldoc number %(docs)s"),{'docs':len(docs)})
    for doc in docs:
        _updateScheduleDoc(doc.id,doc.recrawl_times,'scheduled',crawlfail = True)
    return [_format_pending_crawldoc(doc) for doc in docs]    
Example #24
 def wait_for_inputqueue_ready(self):
     while True:
         if self._m_input_queue == None or \
                 self._m_input_queue.qsize() < CONF.max_queue_size:
             break
         else:
             greenthread.sleep(3)
             LOG.debug(_(" Input Queue is Full at %(cname)s mid:%(m_id)s"),
                         {'cname':self.__class__.__name__, 'm_id':self.m_id})
Example #25
def getFailCrawlDoc(retry_time,limit):
    ''' param retry_time: get failed docs from crawl_fail_pending whose recrawl_times is less than retry_time
        step 1: get docs from crawl_fail_pending whose status is crawled or NULL and recrawl_times < retry_time
        step 2: update recrawl_times, crawl_status and schedule_time'''
    filters = {'crawl_status':'fresh',
               'max_recrawl_time':retry_time}
    docs = _getPendingCrawldoc(crawlfail = True, filters = filters, limit = limit)
    LOG.debug(_("get Fail Crawldoc number %(docs)s"),{'docs':len(docs)})
    for doc in docs:
        _updateScheduleDoc(doc.id,doc.recrawl_times,'crawled',crawlfail = True)
    return [_format_pending_crawldoc(doc) for doc in docs]
Example #26
def getFreshCrawlDoc(limit,level):
    '''param limit: how many crawldocs to fetch
        step1: get up to `limit` crawldocs from crawl_pending
        step2: mark crawl_status from fresh to scheduled, update recrawl_times and schedule_time'''
    filters = {'crawl_status':'fresh',
               'max_level':level}
    docs = _getPendingCrawldoc(filters = filters, limit = limit, sort_key = 'level')
    LOG.debug(_("get Fresh Crawldoc number %(docs)s"),{'docs':len(docs)})
    for doc in docs:
        _updateScheduleDoc(doc.id, doc.recrawl_times, 'scheduled', crawlfail = False)
    return [_format_pending_crawldoc(doc) for doc in docs]
Example #27
 def dispatch(self,crawldoc):
     # fill fake host
     try:
         crawldoc.fake_host = self.hostload.fakeHost(crawldoc.host)
         LOG.debug(_("Fill fake host from: %(host)s to %(fakehost)s"),
                     {'host':crawldoc.host,
                      'fakehost':crawldoc.fake_host})
     except:
         crawldoc.fake_host = None
     disp = getattr(self, 'dispatch_as_%s'%CONF.dispatch_as)
     return disp(crawldoc)
Example #28
 def checkafter(self, crawldoc):
     level, reason = self.getLevelAfter(crawldoc)
     if level > 0:
         LOG.info(
             _('CrawlDoc Check After Healthy: %(level)s Reason:%(reason)s,CrawlDoc:\n%(doc)s'
               ), {
                   'level': _crawldoc_healthy_level[level],
                   'reason': reason,
                   'doc': crawldoc
               })
     else:
         LOG.debug(
             _('CrawlDoc Check After Healthy: %(level)s Reason:%(reason)s'),
             {
                 'level': _crawldoc_healthy_level[level],
                 'reason': reason,
                 'doc': crawldoc
             })
     if level == 2:
         return False
     return True
Example #29
 def ProcessCrawlDoc(self, crawldoc):
     host = crawldoc.host
     if crawldoc.fake_host: host = crawldoc.fake_host
     delay = self.hostload.readyForFetch(host)
     LOG.debug(
         _("Before ProcessCrawldoc sleep %(sleep)s at %(fetch_id)s  crawldoc: %(crawldoc)s"
           ), {
               'sleep': delay,
               'fetch_id': self.m_id,
               'crawldoc': crawldoc
           })
     greenthread.sleep(delay)
     self.client.process(crawldoc)
     print crawldoc.content
     crawldoc.crawl_time = int(timeutils.utcnow_ts())
     LOG.debug(
         _("Finish ProcessCrawldoc at %(fetch_id)s  crawldoc: %(crawldoc)s"
           ), {
               'fetch_id': self.m_id,
               'crawldoc': crawldoc
           })
Example #30
    def start(self):
        LOG.info(_("Begin to loop container.pre_start_hook"))
        for container in self.containers:
            LOG.info(_("%(cname)s Pre_start_hook"),{'cname':container.m_name})
            container.pre_start_hook(self.context)
        LOG.info(_("End to loop container.pre_start_hook"))

        for container in self.containers:
            '''start container thread Run'''
            self.tg.add_thread(container.Run, self.context, self.saved_args, self.saved_kwargs)

        LOG.info(_("Begin to loop container.post_start_hook"))
        for container in self.containers:
            LOG.info(_("%(cname)s Post_start_hook"),{'cname':container.m_name})
            container.post_start_hook(self.context)
        LOG.info(_("End to loop container.post_start_hook"))

        if self.periodic_enable:
            if self.periodic_fuzzy_delay:
                initial_delay = random.randint(0, self.periodic_fuzzy_delay)
            else:
                initial_delay = None
            for container in self.containers:
                ''' start periodic tasks'''
                self.tg.add_dynamic_timer(container.periodic_tasks,
                                     initial_delay=initial_delay,
                                     periodic_interval_max=
                                        self.periodic_interval_max,context = self.context)
            if self.report_interval:
                ''' start report state peridic tasks'''
                self.tg.add_timer(self.report_interval, self._report_state,
                                 self.report_interval, self)
Example #31
def getFreshCrawlDoc(limit, level):
    '''param limit: how many crawldocs to fetch
        step1: get up to `limit` crawldocs from crawl_pending
        step2: mark crawl_status from fresh to scheduled, update recrawl_times and schedule_time'''
    filters = {'crawl_status': 'fresh', 'max_level': level}
    docs = _getPendingCrawldoc(filters=filters, limit=limit, sort_key='level')
    LOG.debug(_("get Fresh Crawldoc number %(docs)s"), {'docs': len(docs)})
    for doc in docs:
        _updateScheduleDoc(doc.id,
                           doc.recrawl_times,
                           'scheduled',
                           crawlfail=False)
    return [_format_pending_crawldoc(doc) for doc in docs]
Example #32
def getTimeoutCrawlDoc(timeout, max_timeout_time, limit):
    ''' param timeout: compared with schedule_time in crawl_pending to detect timed-out crawldocs
        param max_timeout_time: compared with recrawl_times to decide whether to fetch the doc
        step1: get docs whose status is scheduled and that have timed out
        step2: update crawl_pending recrawl_times and schedule_time'''
    filters = {'crawl_status':'scheduled',
               'max_recrawl_time':max_timeout_time,
               'timeout':timeout}
    docs = _getPendingCrawldoc(filters = filters, limit = limit)
    LOG.debug(_("get Timeout Crawldoc number %(docs)s"),{'docs':len(docs)})
    for doc in docs:
        _updateScheduleDoc(doc.id,doc.recrawl_times,'scheduled',crawlfail = False)
    return [_format_pending_crawldoc(doc) for doc in docs]
Example #33
def getFailCrawlDoc(retry_time, limit):
    ''' param retry_time: get failed docs from crawl_fail_pending whose recrawl_times is less than retry_time
        step 1: get docs from crawl_fail_pending whose status is crawled or NULL and recrawl_times < retry_time
        step 2: update recrawl_times, crawl_status and schedule_time'''
    filters = {'crawl_status': 'fresh', 'max_recrawl_time': retry_time}
    docs = _getPendingCrawldoc(crawlfail=True, filters=filters, limit=limit)
    LOG.debug(_("get Fail Crawldoc number %(docs)s"), {'docs': len(docs)})
    for doc in docs:
        _updateScheduleDoc(doc.id,
                           doc.recrawl_times,
                           'crawled',
                           crawlfail=True)
    return [_format_pending_crawldoc(doc) for doc in docs]
Example #34
 def __init__(self, manager=None):
     super(FetcherManagerContainer, self).__init__(manager)
     self._input_queue = None
     self._output_queue = None
     self.m_name = self.__class__.__name__
     self.fetcher_managers = []
     self.dispatcher = Dispatcher()
     LOG.info(
         _("=====================Start FetcherManager  number:%s===================="
           % CONF.fetcher_number))
     i = 0
     while i < CONF.fetcher_number:
         manager_class_name = importutils.import_class(CONF.fetcher_manager)
         manager_class = manager_class_name()
         self.fetcher_managers.append(manager_class)
         LOG.info(
             _("Start Manager: %(mname)s id:%(m_id)s"), {
                 'mname': manager_class.__class__.__name__,
                 'm_id': manager_class.m_id
             })
         i += 1
     LOG.info(_(60 * "="))
Example #35
    def run_periodic_report_tasks(self, service):
        '''TODO: Read from database'''
        ''' get fresh crawldoc'''
        self.wait_for_outputqueue_ready()
        docs = db_api.getFreshCrawlDoc(self.read_batch_num, self.max_level)
        if len(docs) < self.read_batch_num * 0.5:
            ''' get fresh timeout crawldoc '''
            timeout_docs = db_api.getTimeoutCrawlDoc(
                self.crawl_timeout, self.max_timeout_retry_time,
                self.read_batch_num)
            docs = docs + timeout_docs
            if len(docs) < self.read_batch_num * 0.5:
                ''' get crawl fail crawldoc '''
                fail_docs = db_api.getFailCrawlDoc(self.max_fail_retry_time,
                                                   self.read_batch_num)
                docs = docs + fail_docs
                if len(docs) < self.read_batch_num * 0.5:
                    ''' get crawl fail timeout crawldoc'''
                    fail_docs = db_api.getTimeoutFailCrawlDoc(
                        self.crawl_timeout,
                        self.max_fail_retry_time + self.max_timeout_retry_time,
                        self.read_batch_num)
                    # append the result; previously it was fetched but silently dropped
                    docs = docs + fail_docs
        # check good crawldoc or not
        for doc in docs:
            if not self.checker.checkbefore(doc):
                LOG.error(_('UnHealthy  crawldoc %(crawldoc)s'),
                          {'crawldoc': doc})
                continue
            if not self.filter.Legalurl(doc.url):
                LOG.error(_('UnLegalurl  crawldoc %(crawldoc)s'),
                          {'crawldoc': doc})
                continue
            # url and docid save at db_api.addPendingCrawlDocDict
#            doc.url = urlutils.normalize(doc.request_url)
#            doc.docid = mmh3.hash(doc.url)
            doc.host = urlutils.gethost(doc.url)
            LOG.debug(_('Export crawldoc %(crawldoc)s'), {'crawldoc': doc})
            self.output(doc)
Example #36
    def __init__(self, managers, report_interval=None,
             periodic_enable=None, periodic_fuzzy_delay=None,
             periodic_interval_max=None, *args, **kwargs):
        super(MultiServer, self).__init__()
        self.managers_class_name = managers
        self.containers = []
        LOG.info(_("=====================Start ManagerContainers ===================="))
        for manager_class_name in self.managers_class_name:
            if isinstance(manager_class_name, tuple):
                manager_class = importutils.import_class(manager_class_name[0])
                manager_class_number = manager_class_name[1]
                LOG.info(_("Start Multi %s  number %s"%(manager_class,manager_class_number)))
            else:
                manager_class = importutils.import_class(manager_class_name)
                manager_class_number = 1
            if issubclass(manager_class, managercontainer.ManagerContainer):
                manager_or_container = manager_class()
                self.containers.append(manager_or_container)
                LOG.info(_("Start ManagerContainer: %(mname)s"), {'mname':manager_or_container.m_name})
            else:
                container = managercontainer.ManagerContainer(manager = manager_class,
                                                              number = manager_class_number)
                self.containers.append(container)
        LOG.info(_(60*"="))
        self.report_interval = report_interval
        self.periodic_enable = periodic_enable
        self.periodic_fuzzy_delay = periodic_fuzzy_delay
        self.periodic_interval_max = periodic_interval_max
        self.saved_args, self.saved_kwargs = args, kwargs
        self.context = context.get_service_context()
        self.context.tg = self.tg
        self.context.periodic_enable = self.periodic_enable
        self.context.report_interval = self.report_interval
        self.context.periodic_fuzzy_delay = self.periodic_fuzzy_delay
        self.context.periodic_interval_max = self.periodic_interval_max
        LOG.info(_("========================Managers chain================="))
        idx = 0
        while idx < len(self.containers):
            LOG.info(_("Chain %(index)s  Manager: %(mname)s"),
                            {'index':idx,'mname':self.containers[idx].m_name})
            idx += 1
        LOG.info(_(60*"="))

        length = len(self.containers)
        if length == 0:
            # a bare `raise` here has no active exception to re-raise;
            # fail explicitly when no containers were configured
            raise RuntimeError('no manager containers configured')
        i = 1
        while i < length:
            '''connect all manager containers, containers hand in hand'''
            _queue = queue.LightQueue(CONF.max_queue_size)
            self.containers[i-1].output_queue = _queue
            self.containers[i].input_queue = _queue
            i += 1
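The tail of this constructor wires the containers "hand in hand": with N containers it creates N-1 bounded queues, and container i's output queue becomes container i+1's input queue. A standalone sketch of that wiring with plain data structures; Stage, chain_stages and the deque-backed queues are illustrative stand-ins for the real containers and the eventlet LightQueue.

import collections

Stage = collections.namedtuple('Stage', ['name', 'input_queue', 'output_queue'])


def chain_stages(names, make_queue):
    """Give stage i's output queue to stage i+1 as its input queue."""
    queues = [make_queue() for _ in range(len(names) - 1)]
    stages = []
    for i, name in enumerate(names):
        input_queue = queues[i - 1] if i > 0 else None
        output_queue = queues[i] if i < len(queues) else None
        stages.append(Stage(name, input_queue, output_queue))
    return stages

# chain_stages(['reader', 'fetcher', 'handler'], collections.deque)
# -> the reader writes to the queue the fetcher reads from, and so on.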
Example #37
    def ProcessCrawlDoc(self,crawldoc):
        LOG.debug(_('Get one crawldoc at %(name_)s %(handler_id)s crawldoc %(crawldoc)s'),
                  {'name_':self.__class__.__name__,
                   'handler_id':self.m_id,
                   'crawldoc':crawldoc})
        self.htmlparser.process(crawldoc)
        LOG.debug(_('Finish Process one crawldoc at %(name_)s %(handler_id)s crawldoc %(crawldoc)s'),
                  {'name_':self.__class__.__name__,
                   'handler_id':self.m_id,
                   'crawldoc':crawldoc})
        # show some log
        if not self.checker.checkafter(crawldoc):
            return

        start_time = time.time()
        if utils.IsCrawlSuccess(crawldoc):
            db_api.saveSuccessCrawlDoc(crawldoc)
            LOG.debug(_('Finish Save Success Crawldoc to DB use %(cost)s'),
                        {'cost':(time.time()-start_time)})
        else:
            db_api.saveFailCrawlDoc(crawldoc)
            LOG.debug(_('Finish Save Fail Crawldoc to DB use %(cost)s'),
                        {'cost':(time.time()-start_time)})
Example #38
 def readyForFetch(self, nhost):
     '''param nhost: normalized hostname (the fake_host is used instead when present)
         returns the number of seconds the fetcher manager should sleep
         before fetching from this host again'''
     delay = 0
     if nhost in self.hostload_delay.keys():
         if nhost in self.hostload_exception.keys():
             interval = self.hostload_exception[nhost]
         else:
             interval = self.default_hostload
         LOG.debug(
             _("Get hostload interval host: %(host)s, interval:%(interval)s"
               ), {
                   'host': nhost,
                   'interval': interval
               })
         now = int(timeutils.utcnow_ts())
         if now - self.hostload_delay[nhost] < interval:
             delay = interval + self.hostload_delay[nhost] - now
             LOG.debug(
                 _("Interval not arrive for host: %(host)s,"
                   " now:%(now)s delay:%(delay)s last delay:%(last)s"), {
                       'host': nhost,
                       'now': now,
                       'delay': delay,
                       'last': self.hostload_delay[nhost]
                   })
         self.hostload_delay[nhost] = int(timeutils.utcnow_ts())
     else:
         self.hostload_delay[nhost] = int(timeutils.utcnow_ts())
         LOG.debug(_("Add host: %(host)s, delay:%(delay)s"), {
             'host': nhost,
             'delay': self.hostload_delay[nhost]
         })
     if delay < 0: delay = 0
     return delay
Example #39
def getTimeoutFailCrawlDoc(timeout, max_timeout_time, limit):
    filters = {
        'crawl_status': 'scheduled',
        'max_recrawl_time': max_timeout_time,
        'timeout': timeout
    }
    docs = _getPendingCrawldoc(crawlfail=True, filters=filters, limit=limit)
    LOG.debug(_("get Fail Timeout Crawldoc number %(docs)s"),
              {'docs': len(docs)})
    for doc in docs:
        _updateScheduleDoc(doc.id,
                           doc.recrawl_times,
                           'scheduled',
                           crawlfail=True)
    return [_format_pending_crawldoc(doc) for doc in docs]
Example #40
def loadConfToDict(load_dict, conf_list, conf_file, conf_str = ''):
    '''Load "key,value" lines from conf_list and then conf_file into load_dict (first value wins).'''
    LOG.debug(_("%(confstr)s Load from list: %(clist)s, file: %(cfile)s "),
                {'confstr':conf_str,'clist':conf_list,'cfile':conf_file})
    for line in conf_list:
        info = line.split(',')
        info[0] = "".join(info[0].split())
        if not info[0] in load_dict.keys():
            load_dict[info[0]] = info[1]
            LOG.debug(_("%(confstr)s Load key: %(info0)s, value:%(info1)s"),
                        {'confstr':conf_str,'info0':info[0],'info1':info[1]})
    if conf_file and os.path.exists(conf_file):
        with open(conf_file, 'r') as fp:
            lines = fp.readlines()
            for line in lines:
                info = line.split(',')
                info[0] = "".join(info[0].split())
                if not info[0] in load_dict.keys():
                    load_dict[info[0]] = info[1]
                    LOG.debug(_("%(confstr)s Load from file key: %(info0)s, value:%(info1)s"),
                                {'confstr':conf_str,'info0':info[0],'info1':info[1]})
    else:
        LOG.debug(_("%(confstr)s conf file does not exist: %(cfile)s"),
                    {'confstr':conf_str,'cfile':conf_file})
Example #41
 def Run(self, context, *args, **kwargs):
     '''TODO: save url to pending db'''
     urls = []
     if self.filter.Legalurl(CONF.init_url):
         urls.append(CONF.init_url)
     if CONF.init_url_file != '' and os.path.exists(CONF.init_url_file):
         with open(CONF.init_url_file, 'r') as fp:
             lines = fp.readlines()
             for line in lines:
                 if self.filter.Legalurl(line):
                     urls.append(line)
     cl = deweight.get_client()
     for url in urls:
         docid = mmh3.hash(urlutils.normalize(url))
         if not cl.has(docid):
             db_api.addPendingCrawlDoc(url, 0, 0)
             LOG.info(_('Add init url %(url)s'), {'url': url})
Example #42
def getTimeoutCrawlDoc(timeout, max_timeout_time, limit):
    ''' param timeout: compared with schedule_time in crawl_pending to detect timed-out crawldocs
        param max_timeout_time: compared with recrawl_times to decide whether to fetch the doc
        step1: get docs whose status is scheduled and that have timed out
        step2: update crawl_pending recrawl_times and schedule_time'''
    filters = {
        'crawl_status': 'scheduled',
        'max_recrawl_time': max_timeout_time,
        'timeout': timeout
    }
    docs = _getPendingCrawldoc(filters=filters, limit=limit)
    LOG.debug(_("get Timeout Crawldoc number %(docs)s"), {'docs': len(docs)})
    for doc in docs:
        _updateScheduleDoc(doc.id,
                           doc.recrawl_times,
                           'scheduled',
                           crawlfail=False)
    return [_format_pending_crawldoc(doc) for doc in docs]
Example #43
 def post_start_hook(self, context):
     for manager in self.fetcher_managers:
         LOG.info(_("%(cname)s id:%(m_id)s Post_start_hook"), {
             'cname': manager.__class__.__name__,
             'm_id': manager.m_id
         })
         manager.post_start_hook()
     ''' start multi fetchers'''
     if context.periodic_enable:
         if context.periodic_fuzzy_delay:
             initial_delay = random.randint(0, context.periodic_fuzzy_delay)
         else:
             initial_delay = None
         for fetcher in self.fetcher_managers:
             ''' start periodic tasks'''
             context.tg.add_dynamic_timer(
                 fetcher.run_periodic_tasks,
                 initial_delay=initial_delay,
                 periodic_interval_max=context.periodic_interval_max,
                 context=context)
Example #44
def _getPendingCrawldoc(crawlfail=False,
                        delete=False,
                        filters=None,
                        limit=None,
                        sort_key=None,
                        sort_dir='asc'):
    filters = filters or {}

    # FIXME(sirp): now that we have the `disabled` field for instance-types, we
    # should probably remove the use of `deleted` to mark inactive. `deleted`
    # should mean truly deleted, e.g. we can safely purge the record out of the
    # database.

    if crawlfail:
        models_table = models.CrawlFailPending
    else:
        models_table = models.CrawlPending
    query = model_query(models_table)
    if 'crawl_status' in filters and filters['crawl_status'] == 'fresh':
        query = query.filter(
            or_(models_table.crawl_status == 'fresh',
                models_table.crawl_status == ''))
    if 'crawl_status' in filters and filters['crawl_status'] == 'scheduled':
        query = query.filter(models_table.crawl_status == 'scheduled')
    if 'crawl_status' in filters and filters['crawl_status'] == 'crawled':
        query = query.filter(models_table.crawl_status == 'crawled')

    if 'max_level' in filters:
        query = query.filter(models_table.level <= filters['max_level'])

    if 'max_recrawl_time' in filters:
        query = query.filter(
            models_table.recrawl_times <= filters['max_recrawl_time'])
    if 'timeout' in filters:
        query = query.filter(models_table.schedule_time <= filters['timeout'])
    LOG.debug(_("get crawldoc sql %(query)s"), {'query': query})

    return query.all()
Example #45
    def request(self, url, method, **kwargs):
        kwargs.setdefault('headers', kwargs.get('headers', {}))
        kwargs['headers']['User-Agent'] = self.user_agent
        kwargs['headers']['Accept'] = ','.join(self.accept_types)
        if 'body' in kwargs:
            kwargs['headers']['Content-Type'] = 'application/json'
            kwargs['data'] = json.dumps(kwargs['body'])
            del kwargs['body']
#        if self.timeout is not None:
#            kwargs.setdefault('timeout', self.timeout)
#        kwargs['verify'] = self.verify_cert
#         kwargs['allow_redirects'] = True
#        url = urlutils.normalize_url(url)
        self.http_log_req(method, url, kwargs)
        resp = self.http.request(method=method,
                                 url=url,
                                 allow_redirects=True,
                                 **kwargs)
        self.http_log_resp(resp)
        if resp.encoding == 'none' or resp.encoding == 'ISO-8859-1':
            resp.encoding = urlutils.get_charset_from_metadata(resp.text)
        LOG.debug(
            _("request get encoding: %(encoding)s reason: %(reason)s history: %(history)s"
              " elapsed: %(elapsed)s cookies: %(cookies)s headers: %(headers)s status_code: %(status_code)s"
              " url: %(url)s"), {
                  'encoding': resp.encoding,
                  'reason': resp.reason,
                  'history': resp.history,
                  'elapsed': resp.elapsed,
                  'cookies': resp.cookies,
                  'headers': resp.headers,
                  'url': resp.url,
                  'status_code': resp.status_code
              })
        #        LOG.debug(_("request get text: %(text)s"),{'text':resp.text})
        return resp
Example #46
    def parse(self):
        url_dict = {}
        url_parse = urlparse.urlparse(self.url)
        base_url = urlparse.ParseResult(url_parse.scheme, url_parse.netloc,
                                        '/', None, None, None).geturl()
        base_url_domain = urlutils.getdomain(self.url)
        LOG.info(
            _("Url: %(url)s,Get BaseUrl: %(baseurl)s and base_domain:%(basedomain)s"
              ), {
                  'url': self.url,
                  'baseurl': base_url,
                  'basedomain': base_url_domain
              })

        soup = BeautifulSoup(self.content)
        for a_tag in soup.findAll('a'):
            if not a_tag.has_key('href'):
                continue
            if a_tag['href'].lower().find('javascript') != -1:
                continue
            if CONF.filter_no_follow and a_tag.has_key('nofollow'):
                continue
            if CONF.filter_onclick and a_tag.has_key('onclick'):
                continue

            new_url = a_tag['href']
            if base_url and not new_url.startswith("http"):
                if new_url.startswith('/'):
                    new_url = new_url[1:]
                new_url = base_url + new_url
            ret, reason = self.__filter.filter(new_url)
            if ret:
                LOG.info(_("Filter Url: %(url)s,Reason: %(reason)s"), {
                    'url': new_url,
                    'reason': reason
                })
                continue
            if CONF.extract_indomain_link:
                domain = urlutils.getdomain(new_url)
                if not domain.lower() == base_url_domain.lower():
                    LOG.info(_("Filter Url: %(url)s,Reason: NotInDomain"),
                             {'url': new_url})
                    continue
            if new_url in url_dict.keys():
                if not a_tag.string in url_dict[new_url]:
                    url_dict[new_url].append(a_tag.string)
                    LOG.debug(
                        _("Add outlink Text Url: %(url)s,value: %(value)s"), {
                            'url': new_url,
                            'value': url_dict[new_url]
                        })
            else:
                l = list()
                l.append(a_tag.string)
                url_dict[new_url] = l
            LOG.debug(_("Extract Outlink: url: %(url)s,text: %(text)s "), {
                'url': new_url,
                'text': a_tag.string
            })
        for key, value in url_dict.iteritems():
            ol = OutLink(url=key, text='$@$'.join(value))
            self.outlinks.append(ol)
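One design note on parse(): relative hrefs are resolved by concatenating base_url (scheme plus netloc plus '/') with the href, which discards the directory of the page URL. The standard library's urljoin handles both absolute and relative hrefs; a small sketch of that alternative follows (Python 2 urlparse, the module already imported above; urllib.parse on Python 3).

import urlparse


def absolutize(page_url, href):
    """Resolve an <a href> against the page it was found on."""
    return urlparse.urljoin(page_url, href)

# absolutize('http://example.com/a/b.html', 'c.html')
#   -> 'http://example.com/a/c.html'
# absolutize('http://example.com/a/b.html', '/root.html')
#   -> 'http://example.com/root.html'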
Example #47
 def run_periodic_report_tasks(self,service):
     LOG.info(_(" Periodic report task at %(cname)s id:%(m_id)s"),
                 {'cname':self.__class__.__name__, 'm_id':self.m_id})
Example #48
 def ProcessCrawlDoc(self,crawldoc):
     LOG.debug(_('Get one crawldoc at %(name_)s %(handler_id)s crawldoc %(crawldoc)s'),
               {'name_':self.__class__.__name__,
                'handler_id':self.m_id,
                'crawldoc':crawldoc})