Beispiel #1
0
    def __init__(
        self,
        uri='mongodb://localhost:27017/',
        db="_bo",
        url_collection='_urls',
        documents_collection='_docs',
    ):
        """Create the storage handle and put the instance into its start state.

        All four arguments are forwarded positionally, in the order given,
        to ``Mongo``; their defaults are taken as-is from this signature.
        """
        self._mongo = Mongo(uri, db, url_collection, documents_collection)

        # Declare the state attributes up front; reset() is then called to
        # give them their working values (reset() is defined outside this
        # snippet -- presumably it assigns both, confirm against the class).
        self._job = self.idle = None
        self.reset()
Beispiel #2
0
    def __init__(
        self,
        pidfile='/tmp/worker.pid',
        uri='mongodb://localhost:27017/',
        db="_bo",
        url_collection='_urls',
        documents_collection='_docs',
        sleep_time=0.1,
    ):
        """Initialise the daemon base class and this worker's own state.

        ``pidfile`` is handed to the parent initialiser; ``uri``, ``db``,
        ``url_collection`` and ``documents_collection`` are forwarded
        positionally to ``Mongo``; ``sleep_time`` is stored unchanged on
        the instance.
        """
        super(Worker, self).__init__(pidfile)
        self._mongo = Mongo(uri, db, url_collection, documents_collection)

        # Plain bookkeeping attributes: no callback yet, bandwidth counter
        # starts at zero.
        self._callback, self.bandwidth = None, 0
        self.sleep_time = sleep_time
Beispiel #3
0
    def __init__(self,
                 uri="mongodb://localhost:27017/",
                 db="_bo",
                 url_collection="_urls",
                 documents_collection="_docs"):
        """Build the ``Mongo`` handle, then initialise the job state.

        The arguments are passed straight through to ``Mongo`` in the
        same positional order as they appear in this signature.
        """
        self._mongo = Mongo(uri, db, url_collection, documents_collection)

        # Both attributes exist (as None) before reset() runs; reset() is
        # not visible in this snippet -- presumably it overwrites them.
        self._job = self.idle = None
        self.reset()
Beispiel #4
0
class Dispatcher(object):
    '''
    Queues a root URL for a job and waits, level by level, until the
    store reports that nothing is left to scrape or type.

    All persistence goes through ``self._mongo``; every record written or
    read by this class is tagged with the current job id ``self._job``.
    ``self.idle`` is False while a dispatch is in progress and True once
    :meth:`dispatch` has finished.
    '''

    # Keys that a URL dict handed to dispatch() must contain.
    _REQUIRED_KEYS = ('target_url', 'link_level', 'allowed_domains')

    # Fields copied from a stored document when it is re-queued as a URL.
    _URL_KEYS = ('target_url', 'job', 'allowed_domains', 'url', 'level')

    def __init__(self,
                 uri='mongodb://localhost:27017/',
                 db="_bo",
                 url_collection='_urls',
                 documents_collection='_docs'):
        '''
        Create the ``Mongo`` handle (arguments are forwarded positionally)
        and start a fresh job via :meth:`reset`.
        '''
        self._mongo = Mongo(uri, db, url_collection, documents_collection)
        self._job = None
        self.idle = None
        self.reset()

    def reset(self):
        '''Assign a new job id (two joined UUID4 values) and clear ``idle``.'''
        self._job = '{0}-{1}'.format(str(uuid.uuid4()), str(uuid.uuid4()))
        self.idle = False

    def add_url(self, url):
        '''
        Queue ``url`` as the root (level 0) entry of the current job.

        Note that ``url`` is modified in place: ``text``, ``job``, ``url``
        and ``level`` are set on the caller's dict before it is stored.

        Raises:
            KeyError: if ``url`` has no ``target_url`` entry.
        '''
        url['text'] = '<root>'
        url['job'] = self._job
        url['url'] = url['target_url']
        url['level'] = 0
        self._mongo.add_url(url)

    def load_urls_at_level(self, level):
        '''
        Re-queue every document stored for this job at ``level`` as a URL.

        Only the fields listed in ``_URL_KEYS`` are carried over, so the
        document's ``_id`` (and any other extra field) is never copied.

        Returns:
            int: the number of URLs queued.

        Raises:
            KeyError: if a document lacks one of the ``_URL_KEYS`` fields.
        '''
        url_count = 0
        for doc in self._mongo.get_documents_at_level(self._job, level):
            self._mongo.add_url({key: doc[key] for key in self._URL_KEYS})
            url_count += 1
        return url_count

    def dispatch(self, url, clean_job=True):
        '''
        Dispatches the URLs to the workers

        url = {
            'target_url': '',
            'link_level': 0,
            'allowed_domains': [],
        }

        The root URL is queued, then for each level from 0 up to and
        including ``url['link_level']`` the documents found at that level
        are re-queued and the store is polled once per second until it
        reports zero not-scraped and zero not-typed entries.

        Args:
            url: dict with at least the keys shown above; it is modified
                in place by :meth:`add_url`.
            clean_job: when true, ``self._mongo.clean_job`` is called for
                this job after the last level has completed.

        Raises:
            ValueError: if one of the required keys is missing.
        '''
        for key in self._REQUIRED_KEYS:
            if key not in url:
                raise ValueError('Missing key in URL: %s' % key)

        self.idle = False
        self.add_url(url)
        link_level = url['link_level']
        level = 0
        while level < link_level + 1:
            self.load_urls_at_level(level)
            while True:
                _, not_scraped, _, not_typed = \
                    self._mongo.get_counts(self._job)
                # Compare by value: identity (`is 0`) only works by accident
                # for CPython's cached small ints and fails for int
                # subclasses or other numeric zero values.
                finished = not_scraped == 0 and not_typed == 0
                if not finished:
                    time.sleep(1)
                logger.info(("Level: {0} / {1}, Not Scraped: {2},"
                             " Not Typed: {3}").format(level, link_level,
                                                       not_scraped, not_typed))
                if finished:
                    break
            level += 1
        if clean_job:
            self._mongo.clean_job(self._job)
        self.idle = True
        # Use the module logger like the progress messages above.
        logger.info("All URLs processed.")

    def get_documents(self, doc_types=None):
        '''
        Return the documents of the current job.

        Args:
            doc_types: iterable of document type names. ``None`` (the
                default) means ``['*']``. If ``'*'`` appears anywhere in
                it, the result of ``self._mongo.get_all_documents`` is
                returned as-is.

        Returns:
            Whatever ``get_all_documents`` returns when ``'*'`` is
            requested; otherwise a list with the documents of each
            requested type, in the order the types were given.
        '''
        wanted = ['*'] if doc_types is None else list(doc_types)
        if '*' in wanted:
            # The wildcard overrides everything else, so skip the
            # per-type queries whose results would be thrown away.
            return self._mongo.get_all_documents(self._job)
        docs = []
        for doc_type in wanted:
            docs.extend(self._mongo.get_documents(self._job, doc_type))
        return docs

    def clean_job(self):
        '''Ask the store to clean up the current job.'''
        self._mongo.clean_job(self._job)
Beispiel #5
0
class Worker(Daemon):
    '''
    Daemon that repeatedly pulls URLs to scrape and documents to type
    from the store behind ``self._mongo`` and processes them.

    ``self.bandwidth`` accumulates the bandwidth figures reported by the
    scraping and typing helpers since the last call to :meth:`run`.
    '''

    def __init__(self,
                 pidfile='/tmp/worker.pid',
                 uri='mongodb://localhost:27017/',
                 db="_bo",
                 url_collection='_urls',
                 documents_collection='_docs',
                 sleep_time=0.1):
        '''
        Initialise the daemon base class with ``pidfile`` and create the
        ``Mongo`` handle (remaining arguments are forwarded positionally).

        ``sleep_time`` is the pause, in seconds as passed to
        ``time.sleep``, at the start of every pass of the work loop.
        '''
        super(Worker, self).__init__(pidfile)
        self._mongo = Mongo(uri, db, url_collection, documents_collection)
        self.sleep_time = sleep_time
        self._callback = None
        self.bandwidth = 0

    def register_callback(self, callback):
        '''
        Register ``callback``; it is called with the document dict after
        each document has been typed. Pass ``None`` to disable it.
        '''
        self._callback = callback

    def run(self):
        '''
        Reset the bandwidth counter and run the work loop.

        Any ``Exception`` escaping the loop is printed and swallowed, so
        this method returns instead of raising.
        '''
        try:
            self.bandwidth = 0
            self.do_work()
        except Exception as e:
            print(str(e))

    def do_work(self):
        '''
        This function sits until it is told to exit by
        setting self._running to False

        1) try and get a url to scrape
            1a) check if it's an allowed domain
            1b) scrape it and get all of the URLs it links to
            1c) go through all found URLs
                1cI)   check if it's an allowed domain
                1cII)  check if document exists in the collection
                1cIII) add document at level + 1

        Note: we loop until there are no more URLs to scrape

        2) try and get a document to type
            2a) check if it's an allowed domain
            2b) Type the link
            2c) update URL with new type data

        Note: we loop until there are no more documents to type

        After ten consecutive passes without any work, "No Work." is
        printed and the loop sleeps for one extra second.
        '''
        idle_rounds = 0
        self._running = True
        while self._running:
            time.sleep(self.sleep_time)
            # Both phases must run on every pass, so call them separately
            # rather than short-circuiting with ``or``.
            scraped_any = self._scrape_pending_urls()
            typed_any = self._type_pending_documents()
            if scraped_any or typed_any:
                idle_rounds = 0
            # Value comparison: `is 10` relied on CPython's small-int cache.
            if idle_rounds == 10:
                print("No Work.")
                time.sleep(1)
                idle_rounds = 0
            else:
                idle_rounds += 1

    def _scrape_pending_urls(self):
        '''
        Scrape URLs until ``self._mongo.get_url()`` returns None.

        Returns True if at least one URL was fetched from the store.
        '''
        did_work = False
        url = self._mongo.get_url()
        while url is not None:
            did_work = True
            logger.info("Scrape: {0}".format(url['url']))
            # NOTE(review): a URL that fails check_match is never marked as
            # scraped here -- presumably get_url() does not hand it out
            # again; confirm against the Mongo wrapper.
            if check_match(url, url['url']):
                page_urls, bandwidth, _ = get_page_urls(url)
                self.bandwidth += bandwidth
                for page_url in page_urls:
                    if not check_match(url, page_url['url']):
                        continue
                    if not self._mongo.check_document_exists(
                            url, page_url, use_job=True):
                        self._mongo.add_document(url, page_url)
                self._mongo.set_url_scraped(url)
            url = self._mongo.get_url()
        return did_work

    def _type_pending_documents(self):
        '''
        Type documents until ``self._mongo.get_document()`` returns None.

        Returns True if at least one document was fetched from the store.
        '''
        did_work = False
        document = self._mongo.get_document()
        while document is not None:
            did_work = True
            logger.info('Type: {0}'.format(document['url']))
            # NOTE(review): as with URLs, a document that fails check_match
            # is left untouched -- confirm get_document() will not return
            # it again.
            if check_match(document, document['url']):
                doc_type, bad_url, bandwidth, time_taken, _ = \
                    type_document(document)
                self.bandwidth += bandwidth
                self._mongo.set_document_type(
                    document, doc_type, bad_url,
                    bandwidth, time_taken
                )
                if self._callback is not None:
                    self._callback(document)
            document = self._mongo.get_document()
        return did_work
Beispiel #6
0
class Dispatcher(object):
    """
    Queues a root URL for a job and waits, level by level, until the
    store reports that nothing is left to scrape or type.

    All persistence goes through ``self._mongo``; every record written or
    read by this class is tagged with the current job id ``self._job``.
    ``self.idle`` is False while a dispatch is in progress and True once
    :meth:`dispatch` has finished.
    """

    # Keys that a URL dict handed to dispatch() must contain.
    _REQUIRED_KEYS = ("target_url", "link_level", "allowed_domains")

    # Fields copied from a stored document when it is re-queued as a URL.
    _URL_KEYS = ("target_url", "job", "allowed_domains", "url", "level")

    def __init__(
        self, uri="mongodb://localhost:27017/", db="_bo", url_collection="_urls", documents_collection="_docs"
    ):
        """
        Create the ``Mongo`` handle (arguments are forwarded positionally)
        and start a fresh job via :meth:`reset`.
        """
        self._mongo = Mongo(uri, db, url_collection, documents_collection)
        self._job = None
        self.idle = None
        self.reset()

    def reset(self):
        """Assign a new job id (two joined UUID4 values) and clear ``idle``."""
        self._job = "{0}-{1}".format(str(uuid.uuid4()), str(uuid.uuid4()))
        self.idle = False

    def add_url(self, url):
        """
        Queue ``url`` as the root (level 0) entry of the current job.

        Note that ``url`` is modified in place: ``text``, ``job``, ``url``
        and ``level`` are set on the caller's dict before it is stored.

        Raises:
            KeyError: if ``url`` has no ``target_url`` entry.
        """
        url["text"] = "<root>"
        url["job"] = self._job
        url["url"] = url["target_url"]
        url["level"] = 0
        self._mongo.add_url(url)

    def load_urls_at_level(self, level):
        """
        Re-queue every document stored for this job at ``level`` as a URL.

        Only the fields listed in ``_URL_KEYS`` are carried over, so the
        document's ``_id`` (and any other extra field) is never copied.

        Returns:
            int: the number of URLs queued.

        Raises:
            KeyError: if a document lacks one of the ``_URL_KEYS`` fields.
        """
        url_count = 0
        for doc in self._mongo.get_documents_at_level(self._job, level):
            self._mongo.add_url({key: doc[key] for key in self._URL_KEYS})
            url_count += 1
        return url_count

    def dispatch(self, url, clean_job=True):
        """
        Dispatches the URLs to the workers

        url = {
            'target_url': '',
            'link_level': 0,
            'allowed_domains': [],
        }

        The root URL is queued, then for each level from 0 up to and
        including ``url['link_level']`` the documents found at that level
        are re-queued and the store is polled once per second until it
        reports zero not-scraped and zero not-typed entries.

        Args:
            url: dict with at least the keys shown above; it is modified
                in place by :meth:`add_url`.
            clean_job: when true, ``self._mongo.clean_job`` is called for
                this job after the last level has completed.

        Raises:
            ValueError: if one of the required keys is missing.
        """
        for key in self._REQUIRED_KEYS:
            if key not in url:
                raise ValueError("Missing key in URL: %s" % key)

        self.idle = False
        self.add_url(url)
        link_level = url["link_level"]
        level = 0
        while level < link_level + 1:
            self.load_urls_at_level(level)
            while True:
                _, not_scraped, _, not_typed = self._mongo.get_counts(self._job)
                # Compare by value: identity (`is 0`) only works by accident
                # for CPython's cached small ints and fails for int
                # subclasses or other numeric zero values.
                finished = not_scraped == 0 and not_typed == 0
                if not finished:
                    time.sleep(1)
                logger.info(
                    ("Level: {0} / {1}, Not Scraped: {2}," " Not Typed: {3}").format(
                        level, link_level, not_scraped, not_typed
                    )
                )
                if finished:
                    break
            level += 1
        if clean_job:
            self._mongo.clean_job(self._job)
        self.idle = True
        # Use the module logger like the progress messages above.
        logger.info("All URLs processed.")

    def get_documents(self, doc_types=None):
        """
        Return the documents of the current job.

        Args:
            doc_types: iterable of document type names. ``None`` (the
                default) means ``["*"]``. If ``"*"`` appears anywhere in
                it, the result of ``self._mongo.get_all_documents`` is
                returned as-is.

        Returns:
            Whatever ``get_all_documents`` returns when ``"*"`` is
            requested; otherwise a list with the documents of each
            requested type, in the order the types were given.
        """
        wanted = ["*"] if doc_types is None else list(doc_types)
        if "*" in wanted:
            # The wildcard overrides everything else, so skip the
            # per-type queries whose results would be thrown away.
            return self._mongo.get_all_documents(self._job)
        docs = []
        for doc_type in wanted:
            docs.extend(self._mongo.get_documents(self._job, doc_type))
        return docs

    def clean_job(self):
        """Ask the store to clean up the current job."""
        self._mongo.clean_job(self._job)