Example #1
    def __init__(self, web_server, task_id, manager_address, max_url_depth=1):
        threading.Thread.__init__(self)
        self.status_lock = threading.Lock()
        self.cache_lock = threading.RLock()
        self.data_lock = threading.RLock()
        self.statistics_lock = threading.RLock()

        self.web_server = web_server
        self.manager_address = manager_address
        self.link_db = BerkeleyBTreeLinkDB('link_db', SimpleKeyPolicyModule)
        self.content_db = BerkeleyContentDB('content_db')

        self.crawlers = {}
        self.task_id = task_id
        self.max_links = 0
        self.expire_date = None
        self.mime_type = []
        self.uuid = ''
        self.whitelist = []
        self.blacklist = []
        self.urls_per_min = 0

        self.package_cache = {}
        self.package_id = 0
        self.processing_crawlers = []
        self.max_url_depth = max_url_depth
        
        self.status = Status.INIT

        self.crawled_links = []
        self.stats_reset_time = time.time()

        self.logger = logging.getLogger('server')
        _file_handler = logging.FileHandler('server%s.log' % task_id)
        _formatter = logging.Formatter('<%(asctime)s>:%(levelname)s: %(message)s')
        _file_handler.setFormatter(_formatter)
        self.logger.addHandler(_file_handler)
        self.logger.setLevel(logging.DEBUG)
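
The constructor above finishes by wiring up a per-task file logger. As a reference, here is a minimal standalone sketch of that logging pattern; the helper name is illustrative, and only the logger name, file name pattern and format string are taken from the snippet:

import logging

def make_task_logger(task_id):
    # Same configuration as in the snippet: one log file per task id.
    logger = logging.getLogger('server')
    file_handler = logging.FileHandler('server%s.log' % task_id)
    formatter = logging.Formatter('<%(asctime)s>:%(levelname)s: %(message)s')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    logger.setLevel(logging.DEBUG)
    return logger

logger = make_task_logger(42)
logger.debug('task server initialized')  # written to server42.log

Note that logging.getLogger('server') always returns the same shared logger, so constructing several task servers in one process stacks additional handlers on it; the sketch keeps that behaviour from the original code.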
Example #2
    def setup(self):
        self.links_db = BerkeleyBTreeLinkDB("test_db", SimpleKeyPolicyModule)
Example #3
class TestLinkDataBase(object):

    def setup(self):
        self.links_db = BerkeleyBTreeLinkDB("test_db", SimpleKeyPolicyModule)

    def test_one_in_base(self):
        _link = "www.test.com"
        self.links_db.add_link(_link, 1, 1)
        assert self.links_db.get_link() is not None

    def test_get_from_empty_base(self):
        assert self.links_db.get_link() is None

    def test_empty_queue(self):
        _link = "www.test.com"
        self.links_db.add_link(_link, 1, 1)
        self.links_db.get_link()
        assert self.links_db.get_link() is None

    def test_priority_set(self):
        _link = "www.test.com"
        self.links_db.add_link(_link, 1, 8)
        _details = self.links_db.get_details(_link)
        assert _details[2] == '8'

    def test_depth_set(self):
        _link = "www.test.com"
        self.links_db.add_link(_link, 6, 4)
        _details = self.links_db.get_details(_link)
        assert _details[0] == '6'

    def test_is_in_base(self):
        _link = "www.test.com"
        self.links_db.add_link(_link, 1, 1)
        assert self.links_db.is_in_base(_link) is True

    def test_is_not_in_base(self):
        _link1 = "www.test.com"
        _link2 = "www.test22222.com"
        self.links_db.add_link(_link1, 1, 1)
        assert self.links_db.is_in_base(_link2) is False

    def test_best_priority_in_queue1(self):
        """
        Alphabetical order.
        """
        _link = "www.aaa.com"
        self.links_db.add_link("www.abc.com", 1, 3)
        self.links_db.add_link(_link, 1, 1)
        assert self.links_db.get_link() == _link

    def test_best_priority_in_queue2(self):
        """
        Longer link.
        """
        _link = "www.aaa.com"
        self.links_db.add_link(_link, 1, 1)
        self.links_db.add_link("www.aaa.com/aaaa/aa", 1, 3)
        assert self.links_db.get_link() == _link

    def test_best_priority_in_queue3(self):
        """
        Shorter link, but alphabetical order.
        """
        _link = "www.aaa.com"
        self.links_db.add_link(_link, 1, 1)
        self.links_db.add_link("www.zz.com", 1, 3)
        assert self.links_db.get_link() == _link

    def test_best_priority_in_queue4(self):
        """
        Priority.
        """
        _link = "www.aaa.com"
        self.links_db.add_link(_link, 1, 1)
        _link2 = "www.zz.com"
        self.links_db.add_link("www.zz.com", 4, 3)
        assert self.links_db.get_link() == _link2

    def test_best_priority_in_queue5(self):
        """
        Priorities with different numbers of digits.
        """
        _link = "www.zzz.com"
        self.links_db.add_link(_link, 12, 1)
        self.links_db.add_link("www.aaa.com", 4, 3)
        assert self.links_db.get_link() == _link

    def test_best_priority_in_queue6(self):
        """
        Priorities with different numbers of digits.
        """
        _link = "www.zzz.com"
        self.links_db.add_link(_link, 12, 1)
        self.links_db.add_link("www.aaa.com/asasf/34", 4, 3)
        assert self.links_db.get_link() == _link

    def test_set_as_fetched(self):
        _link = "www.zzz.com"
        self.links_db.add_link(_link, 12, 1)
        _t0 = datetime.datetime.now()
        self.links_db.set_as_fetched(_link)
        _details = self.links_db.get_details(_link)
        assert _t0 < datetime.datetime.strptime(_details[1], "%Y-%m-%d %H:%M:%S.%f")

    def test_feedback(self):
        _link = "www.zzz.com"
        self.links_db.add_link(_link, 12, 1)
        self.links_db.change_link_priority(_link, 5)
        _details = self.links_db.get_details(_link)
        assert _details[0] == '5'

    def test_feedback_correct_keys(self):
        _link = "www.zzz.com"
        self.links_db.add_link(_link, 12, 1)
        self.links_db.change_link_priority(_link, 5)
        assert self.links_db.policy_module.generate_key(_link, 5) in self.links_db.priority_queue

    def teardown(self):
        self.links_db.clear()
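
Read together, the tests above pin down the link-database API they rely on: add_link(link, priority, depth), get_link() returning the best-ranked link or None when the queue is empty, get_details(link), is_in_base(link), set_as_fetched(link), change_link_priority(link, priority) and clear(). A rough usage sketch under those assumptions; the import path and database name are illustrative, and the exact layout of the details tuple should be checked against the tests:

# BerkeleyBTreeLinkDB and SimpleKeyPolicyModule come from the project's own modules
# (their import path is not shown in these examples).
db = BerkeleyBTreeLinkDB("example_db", SimpleKeyPolicyModule)

db.add_link("www.aaa.com", 1, 0)      # lower priority, depth 0
db.add_link("www.zzz.com", 4, 0)      # higher priority, should be served first

best = db.get_link()                  # a link, or None when the queue is empty
if best is not None and db.is_in_base(best):
    db.set_as_fetched(best)           # stamps the fetch time
    details = db.get_details(best)    # a tuple of strings; see the tests for which index holds what

db.change_link_priority("www.aaa.com", 5)   # re-rank a link that is already in the base
db.clear()                            # remove the database files, as in teardown()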
Example #4
class TaskServer(threading.Thread):
    def __init__(self, web_server, task_id, manager_address, max_url_depth=1):
        threading.Thread.__init__(self)
        self.status_lock = threading.Lock()
        self.cache_lock = threading.RLock()
        self.data_lock = threading.RLock()
        self.statistics_lock = threading.RLock()

        self.web_server = web_server
        self.manager_address = manager_address
        self.link_db = BerkeleyBTreeLinkDB('link_db', SimpleKeyPolicyModule)
        self.content_db = BerkeleyContentDB('content_db')

        self.crawlers = {}
        self.task_id = task_id
        self.max_links = 0
        self.expire_date = None
        self.mime_type = []
        self.uuid = ''
        self.whitelist = []
        self.blacklist = []
        self.urls_per_min = 0

        self.package_cache = {}
        self.package_id = 0
        self.processing_crawlers = []
        self.max_url_depth = max_url_depth
        
        self.status = Status.INIT

        self.crawled_links = []
        self.stats_reset_time = time.time()

        self.logger = logging.getLogger('server')
        _file_handler = logging.FileHandler('server%s.log' % task_id)
        _formatter = logging.Formatter('<%(asctime)s>:%(levelname)s: %(message)s')
        _file_handler.setFormatter(_formatter)
        self.logger.addHandler(_file_handler)
        self.logger.setLevel(logging.DEBUG)

    def assign_crawlers(self, assignment):
        """
        Sets the current crawler assignment.

        The task server can send crawling requests only to these crawlers,
        and the package size for each crawler must be specified in the assignment dict.
        This allows the crawling efficiency of all task servers to be controlled.
        """
        self.data_lock.acquire()
        self.crawlers = assignment
        self.data_lock.release()
        self.logger.debug('%d crawlers assigned' % len(assignment))

    def assign_speed(self, speed):
        """
        Sets the task server's crawling speed.

        Statistics are reset after each speed change.
        """
        self.data_lock.acquire()
        self.urls_per_min = int(speed)
        self._reset_stats()
        self.data_lock.release()
        self.logger.debug('Changed speed to %d', self.urls_per_min)

    def get_address(self):
        # TODO: change this address to external ip address (and in crawler too)
        return 'http://' + self.web_server.get_host()

    def _set_status(self, status):
        """
        Sets the task server state.
        """
        self.status_lock.acquire()
        self.status = status
        self.status_lock.release()
        self.logger.debug('Changed status to: %d' % status)

    def _get_status(self):
        """
        Returns the current task server state.
        """
        self.status_lock.acquire()
        _status = self.status
        self.status_lock.release()
        return _status

    def _register_to_management(self):
        """
        Sends a registration request to the FCS main application.

        The received data is used to set crawling parameters.
        """
        r = requests.post(self.manager_address + '/autoscale/server/register/',
                                data={'task_id': self.task_id, 'address': self.get_address()})
        self.logger.debug('Registering to management. Return code: %d, message: %s' % (r.status_code, r.content))
        if r.status_code in [status.HTTP_412_PRECONDITION_FAILED, status.HTTP_404_NOT_FOUND]:
            self.stop()
            return

        self._set_status(Status.RUNNING)
        try:
            data = r.json()
            self.update(data)
            self.add_links(data['start_links'], self.link_db.DEFAULT_PRIORITY, 0)
            self.data_lock.acquire()
            self.uuid = data['uuid']
            self.data_lock.release()
            self.logger.debug('Registered to management')
        except (KeyError, ValueError) as e:
            self.logger.debug('Error while registering: %s' % str(e))
            self.stop()

    def _unregister_from_management(self):
        """
        Sends an unregister request to the FCS main application.
        """
        #TODO: check why precondition failed (task not stopped)
        r = requests.post(self.manager_address + '/autoscale/server/unregister/',
                          data={'task_id': self.task_id, 'uuid': self.uuid})
        self.logger.debug('Unregistering from management. Return code: %d, message: %s' % (r.status_code, r.content))

    def update(self, data):
        """
        Updates crawling parameters and status.

        It is usually called when the user changes task data via the GUI or API.
        """
        self.logger.debug('Updating task server: %s' % json.dumps(data))
        if self._get_status() in [Status.STOPPING, Status.STARTING]:
            return
        if data['finished']:
            self.stop()
            return

        self.data_lock.acquire()
        self.whitelist = data['whitelist']
        self.blacklist = data['blacklist']
        self.mime_type = data['mime_type']
        self.max_links = int(data['max_links'])
        self.expire_date = datetime.strptime(data['expire_date'], DATE_FORMAT)
        self.data_lock.release()

        if data['active']:
            self.resume()
        else:
            self.pause()

    def pause(self):
        """
        Pauses the task server if it is running.
        """
        if self._get_status() == Status.RUNNING:
            self._set_status(Status.PAUSED)

    def resume(self):
        """
        Resumes the task server if it is paused.
        """
        if self._get_status() == Status.PAUSED:
            self._set_status(Status.RUNNING)

    def stop(self):
        """
        Sets the STOPPING status.

        A task server in this state won't send any more crawling requests.
        It will wait WAIT_FOR_DOWNLOAD_TIME seconds for the user to download the gathered data.
        """
        self._set_status(Status.STOPPING)

    def kill(self):
        """
        Sets the KILLED status.

        A task server in this state will be stopped as soon as possible.
        """
        self._set_status(Status.KILLED)

    def run(self):
        """
        Main task server loop
        """
        self._set_status(Status.STARTING)
        self.web_server.start()
        self._register_to_management()
        try:
            while self._get_status() not in [Status.STOPPING, Status.KILLED]:
                if self._get_status() == Status.RUNNING and not self._efficiency_achieved():
                    for crawler in self.get_idle_crawlers():
                        package = self._get_links_package(crawler, self.crawlers[crawler])
                        if package:
                            try:
                                requests.post(crawler + '/put_links', json.dumps(package))
                                self.logger.debug('Sending links to crawler %s' % crawler)
                            except ConnectionError:
                                pass
                self._check_cache()
                self._check_limits()
                self._clear_stats()
                time.sleep(1)

            shutdown_time = time.time()
            while (time.time() - shutdown_time) < WAIT_FOR_DOWNLOAD_TIME and self.content_db.size() > 0 \
                    and self._get_status() != Status.KILLED:
                time.sleep(30)
        finally:
            self._unregister_from_management()
            # TODO: check why the server sometimes doesn't shut down here
            self._clear()
            self.logger.debug('Stopping web interface')
            self.web_server.stop()
            self.logger.debug('Task server stopped')

    def get_idle_crawlers(self):
        """
        Returns a list of crawlers that are not processing any requests.
        """
        self.data_lock.acquire()
        crawlers = self.crawlers
        self.data_lock.release()
        self.cache_lock.acquire()
        processing = self.processing_crawlers
        self.cache_lock.release()
        return [crawler for crawler in crawlers if crawler not in processing]

    def _check_limits(self):
        """
        Checks whether crawling limits have been exceeded or no links are left to crawl.
        If so, tries to stop the task.
        """
        self.data_lock.acquire()
        expire_date = self.expire_date
        max_links = self.max_links
        self.data_lock.release()
        self.cache_lock.acquire()
        packages_cached = len(self.package_cache)
        self.cache_lock.release()
        if datetime.now() > expire_date:
            self.logger.debug('Task expired')
            self._stop_task()
        elif self.content_db.added_records_num() > max_links:
            self.logger.debug('Task max links limit exceeded')
            self._stop_task()
        elif self.link_db.size() == 0 and packages_cached == 0:
            self.logger.debug('No links to crawl')
            self._stop_task()

    def _stop_task(self):
        """
        Sends a request to the FCS main application to stop this task server's task.

        In case of an error the task server will be killed.
        """
        r = requests.post(self.manager_address + '/autoscale/server/stop_task/',
                          data={'task_id': self.task_id, 'uuid': self.uuid})
        self.logger.debug('Stopping task. Return code: %d, message: %s' % (r.status_code, r.content))
        if r.status_code in [status.HTTP_412_PRECONDITION_FAILED, status.HTTP_404_NOT_FOUND]:
            self.kill()

    def _cache(self, package_id, crawler, links):
        """
        Puts the link package into the cache and marks the assigned crawler as 'processing'.
        """
        self.cache_lock.acquire()
        has_timed_out = False
        self.package_cache[package_id] = [time.time(), links, crawler, has_timed_out]
        self.processing_crawlers.append(crawler)
        self.cache_lock.release()
        self.logger.debug('Cached package %d' % package_id)

    def _get_links_package(self, crawler, size):
        """
        Prepares a link package of the given size for the given crawler and caches it.
        """
        _links = []
        for i in range(size):
            _links.append(self.link_db.get_link())
        _links = [link for link in _links if link]
        self.logger.debug('Retrieved %d links from linkdb' % len(_links))
        if _links:
            address = self.get_address()
            crawling_type = self.mime_type
            package_id = self.package_id
            #TODO: change 'crawling_type' to 'mime_type'
            package = {'server_address': address, 'crawling_type': crawling_type, 'id': package_id, 'links': _links}
            self._cache(package_id, crawler, _links)
            self.package_id += 1
            return package
        else:
            return None

    def _check_cache(self):
        """
        Scans the package cache looking for timed-out packages.

        Marks timed-out packages as such
        and puts the links they contained back into the database.
        """
        cur_time = time.time()
        self.cache_lock.acquire()
        # iterate over a copy of the keys: _clear_cache() may delete entries during the loop
        for package_id in list(self.package_cache.keys()):
            if (cur_time - self.package_cache[package_id][0] > URL_PACKAGE_TIMEOUT) and \
                    not self.package_cache[package_id][3]:
                self.logger.debug('Package %d has timed out. Readding' % package_id)
                self.readd_links(self.package_cache[package_id][1])
                self.package_cache[package_id][3] = True
            elif (cur_time - self.package_cache[package_id][0]) > 5 * URL_PACKAGE_TIMEOUT:
                self._clear_cache(package_id)
        self.cache_lock.release()

    def _clear_cache(self, package_id):
        """
        Removes the package cache entry for the given package id.

        It also marks the crawler assigned to this crawling request as 'idle'
        so the next request can be sent to it.
        """
        self.cache_lock.acquire()
        try:
            self.processing_crawlers.remove(self.package_cache[package_id][2])
            del self.package_cache[package_id]
            self.logger.debug('Removed package %d from cache' % package_id)
        except KeyError:
            pass
        self.cache_lock.release()

    def feedback(self, regex, rate):
        # TODO: change this method to feedback regex (which will be created soon)
        #self.link_db.change_link_priority(regex, rate)
        pass

    def _evaluate_link(self, link):
        """
        Checks the blacklist and whitelist and decides
        whether the link can be put into the link queue.
        """
        domain = urlparse(link).netloc
        if not domain:
            self.logger.debug('Link evaluation failed. Bad link format: %s' % link)
            return False

        for regex in self.blacklist:
            if re.match(regex, domain):
                return False
        for regex in self.whitelist:
            if re.match(regex, domain):
                return True
        return False

    def add_links(self, links, priority, depth=0, source_url=""):
        _counter = 0
        self.logger.debug('Trying to add %d links' % len(links))
        for link in links:
            _link = URLProcessor.validate(link, source_url)
            if self._evaluate_link(_link) and not self.link_db.is_in_base(_link):
                #_depth = SimpleCrawlingDepthPolicy.calculate_depth(link, source_url, depth)
                #_depth = RealDepthCrawlingDepthPolicy.calculate_depth(link, self.link_db)
                _depth = IgnoreDepthPolicy.calculate_depth()
                if _depth <= self.max_url_depth:
                    self.logger.debug("Added:%s with priority %d" % (_link, _depth))
                    self.link_db.add_link(_link, priority, _depth)
                    _counter += 1
        self.logger.debug("Added %d new links into DB." % _counter)

    def readd_links(self, links):
        for link in links:
            # adds link only when it was earlier in linkdb
            # TODO : ???
            self.link_db.change_link_priority(link, BerkeleyBTreeLinkDB.BEST_PRIORITY)

    def _decode_content(self, content):
        return Base64ContentCoder.decode(content)

    def put_data(self, package_id, data):
        """
        Handles a data package received from a crawler and puts it into the content database.

        If the received package isn't in the package cache (or the crawling request has timed out),
        no data will be stored in the database.
        It also marks the crawler assigned to this crawling request as 'idle'
        so the next request can be sent to it.
        """
        if package_id in self.package_cache:
            has_timed_out = self.package_cache[package_id][3]
            if not has_timed_out:
                self.logger.debug('Putting content from package %d' % package_id)
                start_time = self.package_cache[package_id][0]
                end_time = time.time()
                self._add_stats(start_time, end_time, len(data))
                for entry in data:
                    self.logger.debug('Adding content from url %s' % entry['url'])
                    self.content_db.add_content(entry['url'], entry['links'], entry['content'])
                    # TODO: put correct depth and priority value (based on previous url)
                    _details = self.link_db.get_details(entry['url'])
                    _url_depth = _details[2] if _details is not None else 0
                    self.add_links(entry['links'], BerkeleyBTreeLinkDB.DEFAULT_PRIORITY, _url_depth, entry['url'])
            self._clear_cache(package_id)

    def _clear(self):
        """
        Clears the database files.
        """
        self.logger.debug('Clearing db files')
        self.link_db.clear()
        self.content_db.clear()

    def get_data(self, size):
        """
        Returns path to file with crawling results.
        """
        self.logger.debug('Downloading content - %d size' % size)
        return self.content_db.get_file_with_data_package(size)

    def _add_stats(self, start_time, end_time, links):
        """
        Adds a new statistics entry: the number of links crawled, with start and end times.
        """
        self.statistics_lock.acquire()
        self.crawled_links.append((start_time, end_time, links))
        self.statistics_lock.release()

    def _reset_stats(self):
        """
        Removes all statistics entries and
        resets the time from which statistics are measured.
        """
        self.statistics_lock.acquire()
        self.crawled_links = []
        self.stats_reset_time = time.time()
        self.statistics_lock.release()

    def _get_stats(self, seconds):
        """
        Returns a statistics summary for the last given number of seconds as a dict.

        Dict keys:
        'seconds' - actual number of seconds the measurement covers
        'links' - number of links crawled during this time
        'urls_per_min' - crawling speed assigned to this task server
        """
        self.statistics_lock.acquire()
        now = time.time()
        from_time = now - seconds
        if self.stats_reset_time > from_time:
            from_time = self.stats_reset_time

        links = 0
        for entry in self.crawled_links:
            if entry[0] > from_time:
                links += entry[2]
            elif entry[1] > from_time:
                links += int(entry[2] * (entry[1] - from_time) / (entry[1] - entry[0]))
        self.statistics_lock.release()

        ret = dict()
        ret['seconds'] = int(now - from_time)
        ret['links'] = links
        self.data_lock.acquire()
        ret['urls_per_min'] = self.urls_per_min
        self.data_lock.release()
        return ret

    def _clear_stats(self):
        """
        Removes statistics entries that are too old.
        """
        self.statistics_lock.acquire()
        from_time = max(time.time() - KEEP_STATS_SECONDS, self.stats_reset_time)
        index = 0
        for i in range(len(self.crawled_links)):
            if self.crawled_links[i][0] > from_time:
                index = i
                break
        self.crawled_links = self.crawled_links[index:]
        self.stats_reset_time = from_time
        self.statistics_lock.release()

    def _efficiency_achieved(self):
        """
        Checks whether the recent crawling rate has reached the assigned urls_per_min.
        """
        stats = self._get_stats(CRAWLING_PERIOD)
        if stats['seconds'] > 0:
            return 60. * stats['links'] / stats['seconds'] >= stats['urls_per_min']
        return False
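
Since TaskServer subclasses threading.Thread, it is driven through the usual thread lifecycle plus the control methods shown above (assign_crawlers, assign_speed, update, pause, resume, stop, kill). A rough wiring sketch; the web_server object and the addresses are assumptions, and only TaskServer's own interface is taken from the example:

# web_server is assumed to expose start(), stop() and get_host(), as used above;
# the WebServer constructor here is hypothetical.
web_server = WebServer(host='0.0.0.0', port=8080)

server = TaskServer(web_server,
                    task_id=7,
                    manager_address='http://localhost:8000',
                    max_url_depth=2)
server.start()                                          # runs TaskServer.run() in its own thread

# The FCS manager can later adjust the task:
server.assign_speed(120)                                # target URLs per minute
server.assign_crawlers({'http://crawler-1:8081': 10})   # crawler address -> links per package

server.stop()                                           # switch to STOPPING; run() finishes up
server.join()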