Example #1
    def __init__(self, *args, **kwargs):
        # initialize super's attributes
        super(self.__class__, self).__init__(*args, **kwargs)

        # create domo archive
        self.darc = DArcWriter(os.path.join(settings.ARCHIVES_PATH, self.name))
        # initialize url container
        self.container = UrlsContainer(self.config)
        self.options = self.config.get('options')
Example #2
class Crawler(Worker):
    def __init__(self, *args, **kwargs):
        # initialize super's attributes
        super(Crawler, self).__init__(*args, **kwargs)

        # create domo archive
        self.darc = DArcWriter("%s" % os.path.join(settings.ARCHIVES_PATH, self.name))
        # initialize url container
        self.container = UrlsContainer(self.config)
        self.options = self.config.get("options")

    def run(self):
        # renice the process to the lowest priority
        os.nice(35)
        # start all
        return self.innerProcess()

    def innerProcess(self):

        self.logger.debug("Crawler started")
        # create curl handles
        multi, freelist = create_multi(self.options)

        maxconnections = len(freelist)

        # main crawling loop
        while True:

            st = self.status.get()
            if st == "paused":
                time.sleep(1)
                continue

            elif st == "ended":
                self.logger.debug("killed. exiting..")
                break

            # status is 'running': feed queued urls to the free curl handles
            while freelist and self.container.queue:

                crawl_url = self.container.getnext()
                if crawl_url is None:
                    # we have no urls to crawl
                    break

                # curl operations
                c = freelist.pop()
                c.body = StringIO()
                c.url = crawl_url

                c.setopt(pycurl.WRITEFUNCTION, c.body.write)

                c.setopt(pycurl.URL, crawl_url)
                multi.add_handle(c)

            while True:

                # drive the transfers; perform() must be called again while it returns E_CALL_MULTI_PERFORM
                ret, num_handles = multi.perform()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break

            # Check for curl objects which have terminated, and add them to the freelist
            while True:
                num_q, ok_list, err_list = multi.info_read()

                # operating on successfully fetched urls
                for c in ok_list:
                    # we may have been redirected to some other url
                    current_url = c.getinfo(c.EFFECTIVE_URL)
                    self.logger.debug("Crawled url: %s" % current_url.decode("utf-8"))

                    # detach the handle from the multi object and return it to the freelist
                    multi.remove_handle(c)
                    freelist.append(c)

                    # mark the url (and its redirect target) as done, regardless of content type
                    self.container.updatedone([c.url, current_url])

                    content_type = c.getinfo(c.CONTENT_TYPE)
                    if not content_type:
                        # no content_type == no content
                        continue

                    # read the response body
                    source = c.body.getvalue()
                    c.body.close()

                    # write archive file
                    self.darc.write(source, c, self)

                    if content_type.find("text") != -1:
                        pass

                    self.container.process(source, c.url, current_url)

                for c, errno, errmsg in err_list:
                    url = c.getinfo(c.EFFECTIVE_URL)

                    self.container.updatefailed([c.url, url])

                    multi.remove_handle(c)
                    freelist.append(c)

                if num_q == 0:
                    break

            # We just call select() to sleep until some more data is available.
            multi.select(1.0)
            self.report.set(
                done=len(self.container.done), todo=len(self.container.queue), failed=len(self.container.failed)
            )

            if len(freelist) == maxconnections and len(self.container.queue) == 0:
                # all said and done: every handle is idle and the queue is empty
                break

        # Cleanup
        for c in freelist:
            if getattr(c, "body", None):
                c.body.close()
                c.body = None
            c.close()
        multi.close()

        # close archive file
        self.darc.close()

        self.logger.info("Crawling ended")
        self.status.set("ended")