Example #1
def test_remove_dupes(url):
    params = extract_params(url)
    validated_url = validate_url(url)
    validated_params = extract_params(validated_url)
    assert len(params) == len(validated_params)
    for p in params:
        assert (str(p) + "=" + str(params[p])) in validated_url
Example #2
def test_remove_params(url, params_to_remove):
    validated_url = validate_url(url, params_to_remove)
    url_params = extract_params(url)
    for p in params_to_remove:
        if p in url_params:
            del url_params[p]
    validated_params = extract_params(validated_url)
    assert len(url_params) == len(validated_params)
    for u in url_params:
        assert u in validated_params
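Examples #1 and #2 compare query parameters before and after validation through an extract_params helper that is not shown on this page. The sketch below is a minimal version of what such a helper might look like, assuming it returns the query string as a flat dict; the name and return shape are inferred from the tests, not taken from the project.

from urllib.parse import urlparse, parse_qs

def extract_params(url):
    """Return the query parameters of url as a flat dict (illustrative sketch)."""
    # parse_qs maps each key to a list of values; the tests compare single
    # values, so only the first value per key is kept here.
    query = urlparse(url).query
    return {key: values[0] for key, values in parse_qs(query).items()}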
Example #3
    def _deploy(self, job_id):
        """
        Deploy a spider to crawl the web. Use the DeploymentManager's enqueue
        method to specify which URLs to crawl. Depth should be assigned to each
        submitted URL prior to deployment.

        Arguments:
            job_id: integer job id.

        Returns: None
        """

        if data.job_is_aborted(job_id):
            self._active = False
            self._queue = []
            return

        self._active = True
        queue_copy = self._queue[:]
        for index, url in enumerate(queue_copy):

            if data.job_is_aborted(job_id):
                break

            self._queue.remove(url)
            validated_url = validate_url(url)
            url = validated_url['url']
            webpage_info = data.get_webpage_info(url)

            if not claim(url):
                continue

            if not validated_url['valid']:
                continue

            # Ignore webpages crawled less than 15 min ago.
            if self._less_than_15_min_ago(webpage_info['completion_datetime']):
                continue

            # Database latency means depth is occasionally still unavailable.
            if not webpage_info['depth']:
                # Child URLs with no job_id and no depth have been deleted.
                if bool(data.redis.llen('reg:' + url)):
                    data.redis.set(url, 'ready')
                    self._queue.append(url)
                continue

            depth = webpage_info['depth'] - 1
            self._set_job_status(job_id, depth, index, len(queue_copy))
            self._fetch_and_parse(job_id, url, depth)
            time.sleep(self.delay)

        if data.job_is_aborted(job_id):
            self._active = False
            self._queue = []
        else:
            if len(self._queue):
                time.sleep(self.delay)
                self._deploy(job_id)
            else:
                self._set_job_status(job_id, -1, -1, 0, 'Complete')
                self._active = False
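Unlike the test examples, this crawler expects validate_url to return a mapping with 'valid' and 'url' keys rather than a plain string. The stand-in below only illustrates that return shape, under the assumption that a missing scheme defaults to http; the real validation rules belong to the project.

from urllib.parse import urlparse

def validate_url(url):
    """Illustrative stand-in mirroring the {'valid': ..., 'url': ...} shape that _deploy consumes."""
    cleaned = url.strip()
    if '://' not in cleaned:
        cleaned = 'http://' + cleaned  # assumption: default to http when no scheme is given
    parsed = urlparse(cleaned)
    return {'valid': bool(parsed.netloc), 'url': cleaned}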
Example #4
def test_convert_domain_suffix(url, expected):
    assert validate_url(url) == expected
Example #5
def test_remove_non_existant_params():
    url = 'www.austintexas.gov?a=1&b=2&foo=bar&3=5&4=cats'
    validated_url = validate_url(url, ['cookie'])
    url_params = extract_params(url)
    validated_params = extract_params(validated_url)
    assert len(url_params) == len(validated_params)
Example #6
def test_identity():
    assert validate_url("www.austintexas.gov") == "www.austintexas.gov"
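Taken together, the tests above describe a validate_url that returns a cleaned URL string: duplicate query parameters are collapsed, parameters named in the optional second argument are dropped, and an already-clean URL passes through unchanged. The sketch below is consistent with that behaviour but is not the project's implementation; in particular, the domain-suffix conversion exercised in Example #4 is project-specific and not reproduced here.

from urllib.parse import urlparse, parse_qsl, urlencode

def validate_url(url, params_to_remove=None):
    """Illustrative sketch: collapse duplicate query parameters and drop the named ones."""
    parsed = urlparse(url)
    remove = set(params_to_remove or [])
    params = {}
    for key, value in parse_qsl(parsed.query):
        params.setdefault(key, value)  # keep the first occurrence of each key
    kept = {key: value for key, value in params.items() if key not in remove}
    return parsed._replace(query=urlencode(kept)).geturl()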
Example #7
    def handle_a(self, href):
        # Resolve the anchor's href against the current page URL and record
        # the absolute, fragment-free link when it validates.
        if href:
            validated_href = validate_url(href, self.url)
            if validated_href['valid']:
                target = urldefrag(urljoin(self.url, validated_href['url']))[0]
                self.hyperlinks.append(target)
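handle_a is a callback that an HTML parser invokes with the href of each anchor tag. The harness below shows that wiring with html.parser from the standard library; the LinkCollector class is invented for illustration and skips the project-specific validate_url step.

from html.parser import HTMLParser
from urllib.parse import urljoin, urldefrag

class LinkCollector(HTMLParser):
    """Illustrative harness: records absolute, fragment-free links from fed HTML."""

    def __init__(self, url):
        super().__init__()
        self.url = url
        self.hyperlinks = []

    def handle_starttag(self, tag, attrs):
        # Delegate anchor tags to the handle_a hook, mirroring the example above.
        if tag == 'a':
            self.handle_a(dict(attrs).get('href'))

    def handle_a(self, href):
        if href:
            self.hyperlinks.append(urldefrag(urljoin(self.url, href))[0])

collector = LinkCollector('http://www.austintexas.gov/page')
collector.feed('<a href="/dept?x=1#top">Departments</a>')
print(collector.hyperlinks)  # ['http://www.austintexas.gov/dept?x=1']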