Example #1
def get_job():
    return Job('wikipedia crawler',
               url_patterns,
               MechanizeOpener,
               starts,
               instances=user_config.job.instances,
               user_conf=user_config)
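This get_job() entry point assumes module-level url_patterns and starts definitions living elsewhere in the job file. A minimal sketch of what they might look like, modeled on Examples 2 and 6 below; the parser class, regex, and seed URL are placeholders, and the import paths are assumptions that may vary across Cola versions:

from cola.core.urls import Url, UrlPatterns
from cola.core.parsers import Parser

class WikiParser(Parser):
    # Placeholder parser: a real one would fetch the page and yield new links.
    def parse(self, url=None):
        return []

# Hypothetical routing table: send matching article pages to WikiParser.
url_patterns = UrlPatterns(
    Url(r'^http://en.wikipedia.org/wiki/[^(:|/)]+$', 'wiki_item', WikiParser))

# Hypothetical seed URLs the crawl starts from.
starts = ['http://en.wikipedia.org/wiki/Web_crawler']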
Example #2
    def setUp(self):
        url_patterns = UrlPatterns(
            Url(r'^http://zh.wikipedia.org/wiki/[^(:|/)]+$', 'wiki_item',
                FakeWikiParser))
        fake_user_conf = Config(StringIO(user_conf))

        self.dir = tempfile.mkdtemp()

        self.job = Job(
            'fake wiki crawler',
            url_patterns,
            MechanizeOpener, [
                'http://zh.wikipedia.org/wiki/%E6%97%A0%E6%95%8C%E8%88%B0%E9%98%9F',
            ],
            user_conf=fake_user_conf)

        local_node = 'localhost:%s' % self.job.context.job.port
        nodes = [
            local_node,
        ]

        self.rpc_server = ColaRPCServer(
            ('localhost', self.job.context.job.port))
        self.loader = JobLoader(self.job)
        self.loader.init_mq(self.rpc_server, nodes, local_node, self.dir)

        # Serve RPC requests in a background daemon thread so the test process
        # can exit without waiting on serve_forever().
        thd = threading.Thread(target=self.rpc_server.serve_forever)
        thd.daemon = True
        thd.start()
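No matching tearDown appears in this excerpt. A minimal sketch of one, assuming nothing beyond what the fixture itself creates (ColaRPCServer's shutdown() is also used in Examples 8 and 10; shutil and tempfile are standard library):

    def tearDown(self):
        # Stop the daemonized RPC thread and drop the scratch directory.
        self.rpc_server.shutdown()
        shutil.rmtree(self.dir, ignore_errors=True)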
Example #3
    def prepare(self, job_name, unzip=True, overwrite=False,
                settings=None):
        self.logger.debug('entering worker prepare phase, job id: %s' % job_name)
        if unzip:
            self._unzip(job_name)

        src_job_name = job_name
        job_path = os.path.join(self.job_dir, job_name)

        if not os.path.exists(job_path):
            return False

        job_desc = import_job_desc(job_path)
        if settings is not None:
            job_desc.update_settings(settings)

        job_id = self.ctx.ips.index(self.ctx.ip)
        clear = job_desc.settings.job.clear \
            if self.ctx.is_local_mode else False
        job_name, working_dir = self.ctx._get_name_and_dir(
            self.working_dir, job_name, overwrite=overwrite, clear=clear)

        job = Job(self.ctx, job_path, job_name, job_desc=job_desc,
                  working_dir=working_dir, rpc_server=self.rpc_server,
                  manager=self.ctx.manager, job_offset=job_id)
        # The run thread is only created here; it is started later, once the
        # worker is actually told to run the job.
        t = threading.Thread(target=job.run, args=(True, ))

        job_info = WorkerJobInfo(job_name, working_dir)
        job_info.job = job
        job_info.thread = t
        self.running_jobs[src_job_name] = job_info

        self.logger.debug('worker prepare phase finished, job id: %s' % job_name)
        return True
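prepare() deliberately parks the run thread in self.running_jobs without starting it; the start presumably happens in a separate step once the worker is told to run. A hypothetical sketch of that counterpart (the method name is an assumption, not part of the source):

    def run_job(self, job_name):
        # Start the thread that prepare() parked in self.running_jobs.
        job_info = self.running_jobs[job_name]
        job_info.thread.start()
        return True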
Example #4
def get_job():
    return Job('sina weibo crawler',
               url_patterns,
               MechanizeOpener,
               starts,
               is_bundle=True,
               unit_cls=WeiboUserBundle,
               instances=instances,
               debug=False,
               user_conf=user_config,
               login_hook=login_hook)
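The login_hook passed here is defined elsewhere in the job module; it lets each opener authenticate before crawling. A hypothetical sketch of its shape, assuming Cola invokes it with an opener plus credentials (both the signature and the opener.login call are assumptions):

def login_hook(opener, username, passwd):
    # Authenticate this opener's session before it fetches anything.
    return opener.login(username, passwd)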
Example #5
    def setUp(self):
        self.job = Job('test job', UrlPatterns(), BuiltinOpener, [])
        self.root = tempfile.mkdtemp()

        master_root = os.path.join(self.root, 'master')
        worker_root = os.path.join(self.root, 'worker')
        os.makedirs(master_root)
        os.makedirs(worker_root)

        node = '%s:%s' % (get_ip(), self.job.context.job.port)
        nodes = [node]
        master = '%s:%s' % (get_ip(), self.job.context.job.master_port)

        self.master_loader = MasterJobLoader(self.job, master_root, nodes)
        self.worker_loader = WorkerJobLoader(self.job, worker_root, master)
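Note that the master and worker addresses reuse the same host with two ports drawn from one job context (job.port and job.master_port). Cleanup is again not shown; a one-line tearDown in the spirit of the sketch after Example 2 would suffice:

    def tearDown(self):
        # Both master_root and worker_root live under self.root.
        shutil.rmtree(self.root, ignore_errors=True)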
Example #6
def get_job():
    urls = []
    for pattern in user_config.job.patterns:
        url_pattern = Url(pattern.regex,
                          pattern.name,
                          GenericParser,
                          store=pattern.store,
                          extract=pattern.extract)
        urls.append(url_pattern)
    url_patterns = UrlPatterns(*urls)

    return Job(user_config.job.name,
               url_patterns,
               MechanizeOpener,
               starts,
               instances=user_config.job.instances,
               user_conf=user_config)
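This variant is fully config-driven. Judging only from the attributes the code reads (job.name, job.instances, and per-pattern regex/name/store/extract), the user config plausibly looks something like the sketch below; the Config import path, the YAML layout, and every value are assumptions:

from StringIO import StringIO
from cola.core.config import Config

# Hypothetical config text, inferred from the attributes get_job reads.
conf_text = '''
job:
  name: generic crawler
  instances: 2
  patterns:
    - regex: ^http://example.com/item/.+$
      name: item
      store: yes
      extract: yes
'''
user_config = Config(StringIO(conf_text))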
Example #7
    def setUp(self):
        url_patterns = UrlPatterns(
            Url(r'^http://zh.wikipedia.org/wiki/[^(:|/)]+$', 'wiki_item',
                FakeWikiParser))
        fake_user_conf = Config(StringIO(user_conf))

        self.dir = tempfile.mkdtemp()

        self.job = Job(
            'fake wiki crawler',
            url_patterns,
            MechanizeOpener, [
                'http://zh.wikipedia.org/wiki/%E6%97%A0%E6%95%8C%E8%88%B0%E9%98%9F',
            ],
            user_conf=fake_user_conf)

        self.local_node = 'localhost:%s' % self.job.context.job.port
        self.nodes = [
            self.local_node,
        ]
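Unlike Example 2, this fixture stops before creating the RPC server, keeping self.nodes and self.local_node around for individual tests to wire up. A hypothetical test doing exactly that, reusing only the calls Example 2 already demonstrates (the test name is a placeholder):

    def test_init_mq(self):
        rpc_server = ColaRPCServer(('localhost', self.job.context.job.port))
        loader = JobLoader(self.job)
        # Wire the message queue across the single local node.
        loader.init_mq(rpc_server, self.nodes, self.local_node, self.dir)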
Example #8
    def _run_local_job(self, job_path, overwrite=False, rpc_server=None, settings=None):
        job_desc = import_job_desc(job_path)
        if settings is not None:
            job_desc.update_settings(settings)
        base_name = job_desc.uniq_name
        self.env['job_desc'][base_name] = job_desc

        addr_dirname = self.addr.replace('.', '_').replace(':', '_')
        working_dir = os.path.join(self.working_dir, 'worker', addr_dirname)
        clear = job_desc.settings.job.clear
        job_name, working_dir = self._get_name_and_dir(
            working_dir, base_name, overwrite=overwrite, clear=clear)

        clock = Clock()
        job = Job(self, job_path, job_name, job_desc=job_desc,
                  working_dir=working_dir, rpc_server=rpc_server,
                  manager=self.manager)
        t = threading.Thread(target=job.run, args=(True, ))
        t.start()
        
        stopped = multiprocessing.Event()
        def stop(signum, frame):
            if 'main' not in multiprocessing.current_process().name.lower():
                return
            if stopped.is_set():
                return
            else:
                stopped.set()
                
            self.logger.debug("Catch interrupt signal, start to stop")
            job.shutdown()
            if rpc_server:
                rpc_server.shutdown()
            
        signal.signal(signal.SIGINT, stop)
        signal.signal(signal.SIGTERM, stop)
        
        idle_times = 0
        while t.is_alive():
            if job.get_status() == FINISHED:
                break
            if job.get_status() == IDLE:
                idle_times += 1
                if idle_times > MAX_IDLE_TIMES:
                    break
            else:
                idle_times = 0
            
            try:
                t.join(5)
            except IOError:
                break
            
        need_shutdown = False
        if not job.stopped.is_set() and job.get_status() == FINISHED:
            self.logger.debug('All objects have been fetched, try to finish job')
            need_shutdown = True
        elif not stopped.is_set() and not t.is_alive():
            need_shutdown = True
        elif not job.stopped.is_set() and job.get_status() == IDLE:
            self.logger.debug('No bundle or url to perform, try to finish job')
            need_shutdown = True
            
        if need_shutdown:
            job.shutdown()
            if rpc_server:
                rpc_server.shutdown()

        self.logger.debug('Job id: %s finished, spent %.2f seconds running' % (
            job_name, clock.clock()))
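Elapsed time comes from a Clock helper whose implementation is not shown. A stand-in consistent with its usage here (constructed before the run, clock() returning seconds since construction) could be as simple as:

import time

class Clock(object):
    # Hypothetical stand-in matching only how Clock is used above.
    def __init__(self):
        self.start = time.time()

    def clock(self):
        return time.time() - self.start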
Example #9
def get_job():
    return Job('weibo search crawler', url_patterns, get_opener, [],
               is_bundle=True, unit_cls=WeiboSearchBundle, 
               instances=instances, debug=debug, user_conf=user_config,
               login_hook=login_hook)
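Unlike the other get_job examples, which pass the MechanizeOpener class itself, this one passes a get_opener factory defined elsewhere in the module. A hypothetical sketch of such a factory (its body is an assumption):

def get_opener():
    # Build a fresh opener; a real factory might configure proxies or cookies.
    return MechanizeOpener()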
Example #10
    def _run_local_job(self,
                       job_path,
                       overwrite=False,
                       rpc_server=None,
                       settings=None):
        job_desc = import_job_desc(job_path)
        if settings is not None:
            job_desc.update_settings(settings)
        base_name = job_desc.uniq_name
        self.env['job_desc'][base_name] = job_desc

        working_dir = os.path.join(self.working_dir, 'worker')
        clear = job_desc.settings.job.clear
        job_name, working_dir = self._get_name_and_dir(working_dir,
                                                       base_name,
                                                       overwrite=overwrite,
                                                       clear=clear)

        clock = Clock()
        job = Job(self,
                  job_path,
                  job_name,
                  job_desc=job_desc,
                  working_dir=working_dir,
                  rpc_server=rpc_server,
                  manager=self.manager)
        t = threading.Thread(target=job.run, args=(True, ))
        t.start()

        stopped = multiprocessing.Event()

        def stop(signum, frame):
            if 'main' not in multiprocessing.current_process().name.lower():
                return
            if stopped.is_set():
                return
            else:
                stopped.set()

            self.logger.debug("Catch interrupt signal, start to stop")
            job.shutdown()
            if rpc_server:
                rpc_server.shutdown()

        signal.signal(signal.SIGINT, stop)
        signal.signal(signal.SIGTERM, stop)

        idle_times = 0
        while t.is_alive():
            if job.get_status() == FINISHED:
                break
            if job.get_status() == IDLE:
                idle_times += 1
                if idle_times > MAX_IDLE_TIMES:
                    break
            else:
                idle_times = 0

            try:
                t.join(5)
            except IOError:
                break

        need_shutdown = False
        if not job.stopped.is_set() and job.get_status() == FINISHED:
            self.logger.debug(
                'All objects have been fetched, try to finish job')
            need_shutdown = True
        elif not stopped.is_set() and not t.is_alive():
            need_shutdown = True
        elif not job.stopped.is_set() and job.get_status() == IDLE:
            self.logger.debug('No bundle or url to perform, try to finish job')
            need_shutdown = True

        if need_shutdown:
            job.shutdown()
            if rpc_server:
                rpc_server.shutdown()

        self.logger.debug(
            'Job id: %s finished, spent %.2f seconds running' %
            (job_name, clock.clock()))
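Examples 8 and 10 are two revisions of the same method; the substantive difference is that Example 8 gives each node its own working directory derived from its address (addr_dirname), while this version shares a single 'worker' directory. In both, the loop polls job.get_status() roughly every five seconds (t.join(5)) and bails out after MAX_IDLE_TIMES consecutive IDLE polls. FINISHED, IDLE, and MAX_IDLE_TIMES are imported from elsewhere in cola; placeholder definitions, purely to make the loop concrete (the actual values are assumptions):

# Hypothetical placeholders for cola's job status constants.
RUNNING, IDLE, FINISHED = range(3)
MAX_IDLE_TIMES = 50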