Exemple #1
0
class Spider(object):
    """Level-by-level web crawler that downloads pages through a thread pool.

    Starting from ``seed``, every known URL that has not been handled yet is
    submitted to the pool; links found on downloaded pages are added to the
    list of known URLs so that the next level can pick them up.
    """

    def __init__(self, seed, depth, pool_size=10):
        """Create a crawler.

        :param seed: URL the crawl starts from.
        :param depth: number of additional levels to crawl after the seed
            level (``crawl`` runs ``depth + 1`` scheduling rounds).
        :param pool_size: size passed to ``ThreadPool``.
        """
        self.seed = seed
        self.depth = depth
        # Every URL discovered so far, in discovery order (seed first).
        self.all_url_list = [seed]
        # URLs whose page was read successfully.
        self.finished_url_list = []
        # URLs for which download() raised.
        self.failure_url_list = []
        self.pool = ThreadPool(pool_size)

    def crawl(self):
        """Submit all pending URLs to the pool, one round per depth level.

        ``self.depth`` is only read, never modified, so the spider can be
        crawled again.  A URL is submitted at most once per call, even if
        its download has not completed (or has failed) by the next round.
        """
        # URLs that must not be submitted (again) during this call.
        scheduled = set(self.finished_url_list)
        # Local countdown: same number of rounds as before (depth + 1,
        # none for a negative depth) without destroying self.depth.
        remaining = self.depth
        while remaining >= 0:
            # Iterate over a snapshot because download() appends newly
            # discovered links to all_url_list.
            for url in list(self.all_url_list):
                if url in scheduled:
                    continue
                scheduled.add(url)
                self.pool.add_task(self.download, url)
            # NOTE(review): close() is still called once per round, exactly
            # as before.  Whether it waits for the submitted tasks, and
            # whether the pool accepts tasks afterwards, depends on the
            # ThreadPool implementation, which is not visible here - confirm.
            self.pool.close()
            remaining -= 1

    def download(self, url):
        """Fetch ``url`` and extract its outgoing links.

        On success the URL is appended to ``finished_url_list`` and the
        extracted links are added to ``all_url_list``.  If anything raises
        (fetching or parsing), the error is printed and the URL is appended
        to ``failure_url_list``.

        :param url: address to download.
        :returns: ``(page, links)`` on success, ``None`` on any exception.
        """
        try:
            response = urllib2.urlopen(url)
            try:
                page = response.read()
            finally:
                # Release the connection even when read() fails.
                response.close()
            # Mark as finished before parsing so get_urls() filters out
            # links pointing back to this page.
            self.finished_url_list.append(url)
            links = self.get_urls(page)
            self._remember_links(links)
            return page, links
        except Exception as e:
            print('open url:%s raise exception(%s)' % (url, e))
            self.failure_url_list.append(url)
            return None

    def _remember_links(self, links):
        """Add not-yet-known ``links`` to ``all_url_list``, keeping order."""
        for link in links:
            if link not in self.all_url_list:
                self.all_url_list.append(link)

    def get_urls(self, page):
        """Return the ``http://`` links found in ``page``.

        Prints the page title (when present) and the resulting link list.
        Links already present in ``finished_url_list`` are skipped.

        :param page: raw page content handed to ``BeautifulSoup``.
        :returns: list of ``href`` values, in document order.
        """
        soup = BeautifulSoup(page, fromEncoding="gb18030")
        if soup.title:
            print(soup.title.string)
        links = []
        for anchor in soup.findAll('a'):
            link = anchor.get('href')
            if not link or not link.startswith('http://'):
                continue
            if link in self.finished_url_list:
                continue
            links.append(link)
        print(links)
        return links

    def get_next_url(self):
        """Placeholder; currently does nothing."""
        pass
Exemple #2
0
def worker_routine(context, idx):
    """Serve dispatcher requests by running the matching client executables.

    Connects a REP socket to ``url_worker`` for requests and a SUB socket to
    ``url_worker_cmd`` for control commands, then polls both until an
    ``EXIT`` command is received.  A request has the form
    ``"<client name> <message>"``; for every configured client whose name
    matches, its executable is scheduled on the thread pool with the
    message appended to the command line.

    The reply is ``"OK"`` once at least one executable has been scheduled
    and ``"FAILED"`` when the request is empty or no configured client
    matches.

    :param context: zmq context used to create both sockets.
    :param idx: worker index, used to build the ``Worker-<idx>`` identity.
    """
    # Socket to talk to dispatcher
    socket0 = context.socket(zmq.REP)
    worker_id = 'Worker-%s' % idx
    # The identity has to be set before connect(): socket options only take
    # effect for connections made after they are set.
    socket0.setsockopt(zmq.IDENTITY, worker_id)
    socket0.connect(url_worker)

    # Socket receiving control commands (EXIT / CONFIG_CHANGED).
    socket_exit = context.socket(zmq.SUB)
    socket_exit.connect(url_worker_cmd)
    socket_exit.setsockopt(zmq.SUBSCRIBE, '')
    poller = zmq.Poller()
    poller.register(socket0, zmq.POLLIN)
    poller.register(socket_exit, zmq.POLLIN)

    from threadpool import ThreadPool, WorkRequest
    thread_pool = ThreadPool(thread_num_of_worker)

    def thread_work(client_exe_path, msg):
        """Run ``client_exe_path`` with ``msg`` and wait for it to finish."""
        # create a process to execute the client_exe
        # TODO: add execute timeout
        # NOTE(review): msg comes straight from the request socket and is
        # interpolated into the command string.  If WrapCommand hands the
        # string to a shell this allows command injection - confirm and
        # validate/quote the message if so.
        _cmd = '%s %s' % (client_exe_path, msg)
        cmd = WrapCommand(_cmd)
        cmd.start()
        logger.debug('Start command start %s' % _cmd)
        cmd.join()
        logger.debug('Start command end %s' % _cmd)
        return cmd

    def thread_work_cb(request, ret):
        """Log the outcome of a finished command based on its return code."""
        if ret.returncode == 0:
            # run cmd process in thread success
            logger.debug("execute %s success %s" %
                         (request.args[0], ret.returncode))
        else:
            logger.error(
                "execute %s failed %s \n%s\n" %
                (request.args[0], ret.returncode, ' '.join(ret.results)))

    def handle_exception(request, exc_info):
        """Log an exception raised while processing a work request."""
        if not isinstance(exc_info, tuple):
            # Something is seriously wrong...
            logger.debug(request)
            logger.debug(exc_info)
            raise SystemExit
        logger.debug("**** Exception occured in request #%s: %s" %
                     (request.requestID, exc_info))

    def register_task(client_exe_path, msg):
        """Queue one execution of ``client_exe_path`` on the thread pool."""
        request = WorkRequest(thread_work, (client_exe_path, msg), {},
                              callback=thread_work_cb,
                              exc_callback=handle_exception)
        thread_pool.putRequest(request)

    def get_client_exe_paths(client_name):
        """Return the exe paths of all configured clients named ``client_name``.

        Always returns a list; it is empty when no client matches.
        """
        client_exe_paths = []
        for client_id in CONF.clients.clients_list:
            client_conf = getattr(CONF, client_id)
            if client_conf.name == client_name:
                client_exe_paths.append(client_conf.exe_path)
        return client_exe_paths

    def split_request(raw):
        """Split a request into ``(client_name, message)``.

        Returns two empty strings for an empty or whitespace-only request.
        Splitting (rather than slicing by the name's length) keeps the
        message intact when the request starts with whitespace.
        """
        parts = raw.split(None, 1)
        if not parts:
            return '', ''
        if len(parts) == 1:
            return parts[0], ''
        return parts[0], parts[1].strip()

    try:
        while True:
            # Wake up at least every 2000 ms even when nothing arrives.
            events = dict(poller.poll(2000))
            if events.get(socket0) == zmq.POLLIN:
                # Deal with message
                _msg = socket0.recv()

                print("Received request: [%s]\n" % (_msg))
                client_name, msg = split_request(_msg)
                # An empty request can never match a client; skip the lookup.
                exe_paths = (get_client_exe_paths(client_name)
                             if client_name else [])
                logger.debug("Client name \"%s\" message: %s" %
                             (client_name, msg))
                logger.debug("Client exe path %s" % (exe_paths,))
                # A REP socket must answer every request, so both branches
                # send exactly one reply.
                if exe_paths:
                    for exe_path in exe_paths:
                        register_task(exe_path, msg)
                    # send reply back to client
                    socket0.send("OK")
                else:
                    logger.debug(
                        "Can not execute client, because there have none clients in config file(%s), please check"
                        % str(CONF.clients.clients_list))
                    socket0.send('FAILED')

            if events.get(socket_exit) == zmq.POLLIN:
                cmd = socket_exit.recv()
                logger.debug('%s CMD %s' % (worker_id, cmd))
                sys.stdout.flush()
                if cmd == 'EXIT':
                    break
                elif cmd == 'CONFIG_CHANGED':
                    reload_config()
    finally:
        # Release the pool and both sockets on normal exit and on errors.
        try:
            thread_pool.close()
            thread_pool.join()
        finally:
            socket0.close()
            socket_exit.close()
Exemple #3
0
def worker_routine(context, idx):
    """Serve dispatcher requests by running the matching client executables.

    Connects a REP socket to ``url_worker`` for requests and a SUB socket to
    ``url_worker_cmd`` for control commands, then polls both until an
    ``EXIT`` command is received.  A request has the form
    ``"<client name> <message>"``; for every configured client whose name
    matches, its executable is scheduled on the thread pool with the
    message appended to the command line.

    The reply is ``"OK"`` once at least one executable has been scheduled
    and ``"FAILED"`` when the request is empty or no configured client
    matches.

    :param context: zmq context used to create both sockets.
    :param idx: worker index, used to build the ``Worker-<idx>`` identity.
    """
    # Socket to talk to dispatcher
    socket0 = context.socket(zmq.REP)
    worker_id = 'Worker-%s' % idx
    # The identity has to be set before connect(): socket options only take
    # effect for connections made after they are set.
    socket0.setsockopt(zmq.IDENTITY, worker_id)
    socket0.connect(url_worker)

    # Socket receiving control commands (EXIT / CONFIG_CHANGED).
    socket_exit = context.socket(zmq.SUB)
    socket_exit.connect(url_worker_cmd)
    socket_exit.setsockopt(zmq.SUBSCRIBE, '')
    poller = zmq.Poller()
    poller.register(socket0, zmq.POLLIN)
    poller.register(socket_exit, zmq.POLLIN)

    from threadpool import ThreadPool, WorkRequest
    thread_pool = ThreadPool(thread_num_of_worker)

    def thread_work(client_exe_path, msg):
        """Run ``client_exe_path`` with ``msg`` and wait for it to finish."""
        # create a process to execute the client_exe
        # TODO: add execute timeout
        # NOTE(review): msg comes straight from the request socket and is
        # interpolated into the command string.  If WrapCommand hands the
        # string to a shell this allows command injection - confirm and
        # validate/quote the message if so.
        _cmd = '%s %s' % (client_exe_path, msg)
        cmd = WrapCommand(_cmd)
        cmd.start()
        logger.debug('Start command start %s' % _cmd)
        cmd.join()
        logger.debug('Start command end %s' % _cmd)
        return cmd

    def thread_work_cb(request, ret):
        """Log the outcome of a finished command based on its return code."""
        if ret.returncode == 0:
            # run cmd process in thread success
            logger.debug("execute %s success %s" % (request.args[0], ret.returncode))
        else:
            logger.error("execute %s failed %s \n%s\n" % (request.args[0], ret.returncode, ' '.join(ret.results)))

    def handle_exception(request, exc_info):
        """Log an exception raised while processing a work request."""
        if not isinstance(exc_info, tuple):
            # Something is seriously wrong...
            logger.debug(request)
            logger.debug(exc_info)
            raise SystemExit
        logger.debug("**** Exception occured in request #%s: %s" %
                     (request.requestID, exc_info))

    def register_task(client_exe_path, msg):
        """Queue one execution of ``client_exe_path`` on the thread pool."""
        request = WorkRequest(thread_work, (client_exe_path, msg), {}, callback=thread_work_cb,
                              exc_callback=handle_exception)
        thread_pool.putRequest(request)

    def get_client_exe_paths(client_name):
        """Return the exe paths of all configured clients named ``client_name``.

        Always returns a list; it is empty when no client matches.
        """
        client_exe_paths = []
        for client_id in CONF.clients.clients_list:
            client_conf = getattr(CONF, client_id)
            if client_conf.name == client_name:
                client_exe_paths.append(client_conf.exe_path)
        return client_exe_paths

    def split_request(raw):
        """Split a request into ``(client_name, message)``.

        Returns two empty strings for an empty or whitespace-only request.
        Splitting (rather than slicing by the name's length) keeps the
        message intact when the request starts with whitespace.
        """
        parts = raw.split(None, 1)
        if not parts:
            return '', ''
        if len(parts) == 1:
            return parts[0], ''
        return parts[0], parts[1].strip()

    try:
        while True:
            # Wake up at least every 2000 ms even when nothing arrives.
            events = dict(poller.poll(2000))
            if events.get(socket0) == zmq.POLLIN:
                # Deal with message
                _msg = socket0.recv()

                print("Received request: [%s]\n" % (_msg))
                client_name, msg = split_request(_msg)
                # An empty request can never match a client; skip the lookup.
                exe_paths = get_client_exe_paths(client_name) if client_name else []
                logger.debug("Client name \"%s\" message: %s" % (client_name, msg))
                logger.debug("Client exe path %s" % (exe_paths,))
                # A REP socket must answer every request, so both branches
                # send exactly one reply.
                if exe_paths:
                    for exe_path in exe_paths:
                        register_task(exe_path, msg)
                    # send reply back to client
                    socket0.send("OK")
                else:
                    logger.debug("Can not execute client, because there have none clients in config file(%s), please check" % str(CONF.clients.clients_list))
                    socket0.send('FAILED')

            if events.get(socket_exit) == zmq.POLLIN:
                cmd = socket_exit.recv()
                logger.debug('%s CMD %s' % (worker_id, cmd))
                sys.stdout.flush()
                if cmd == 'EXIT':
                    break
                elif cmd == 'CONFIG_CHANGED':
                    reload_config()
    finally:
        # Release the pool and both sockets on normal exit and on errors.
        try:
            thread_pool.close()
            thread_pool.join()
        finally:
            socket0.close()
            socket_exit.close()