Exemple #1
0
    def __init__(self, stats, ignore_url=ignore_url):
        """Set up an empty frontier.

        Args:
            stats: Object stored as ``self.stats``; not used further here.
            ignore_url: Callable taking a URL. When ``None`` is passed, a
                predicate that returns ``False`` for every URL is used
                instead.
        """
        # Bookkeeping containers start out empty.
        self.urls = set()
        self.buckets = defaultdict(set)
        # Items are tuples; index 1 is used as the key, index 0 as the value.
        self.hosts = SortedKeyValue(key=itemgetter(1), value=itemgetter(0))

        self.stats = stats
        self.ignore_url = (
            (lambda url: False) if ignore_url is None else ignore_url
        )
Exemple #2
0
def test_remove():
    """Check that ``remove`` empties the container.

    The second round removes with a different index-0 element
    (``'whatever'``) than was inserted (``20``) and still expects the
    entry to be gone.
    """
    host = 'en.wikipedia.org'
    store = SortedKeyValue(itemgetter(1), itemgetter(0))

    # Round 1: remove with exactly the tuple that was inserted.
    store.insert((0, host))
    store.remove((0, host))
    for sized in (store, store.keys, store.values):
        assert len(sized) == 0

    # Round 2: remove with a mismatching index-0 element.
    store.insert((20, host))
    store.remove(('whatever', host))
    for sized in (store, store.keys, store.values):
        assert len(sized) == 0
Exemple #3
0
def test_insert():
    """Check that repeated inserts for one host keep a single entry.

    After each insert, ``find_le`` is expected to return an item whose
    index-1 element is the host name, and after three inserts for the
    same host the container must still report a length of one.
    """
    host = 'en.wikipedia.org'
    store = SortedKeyValue(itemgetter(1), itemgetter(0))

    store.insert((0, host))
    found = store.find_le(10)
    assert found[1] == host

    # Re-insert the same host with two other index-0 elements.
    for stamp in (10, 20):
        store.insert((stamp, host))

    found = store.find_le(20)
    assert found[1] == host
    assert len(store) == 1
    assert len(store.keys) == 1
    assert len(store.values) == 1
Exemple #4
0
class URLFrontier(object):
    """Queue of URLs grouped per host name.

    State kept on the instance:

    * ``urls``    -- every URL ever passed to ``add`` (used to skip
      duplicates; entries are not removed by ``pop``).
    * ``buckets`` -- maps a host name to the set of its pending URLs.
    * ``hosts``   -- ``SortedKeyValue`` of ``(waittime, hostname)`` tuples,
      keyed on the host name (index 1) with the wait time (index 0) as value.
    """

    # Seconds added to the current time by ``get_waittime``. Override on a
    # subclass or instance to change the delay.
    crawl_delay = 10

    def __init__(self, stats, ignore_url=ignore_url):
        """Set up an empty frontier.

        Args:
            stats: Mapping updated by ``add`` with the keys
                ``'URLs frontier'`` and ``'hostnames'``.
            ignore_url: Callable taking a URL; a truthy result makes ``add``
                skip that URL. ``None`` means no URL is ignored.
        """
        self.stats = stats
        if ignore_url is None:
            ignore_url = lambda url: False

        self.ignore_url = ignore_url

        self.urls = set()
        self.buckets = defaultdict(set)
        self.hosts = SortedKeyValue(key=itemgetter(1), value=itemgetter(0))

    def get_waittime(self, domain):
        """Return the timestamp (whole seconds) stored as the next wait time.

        ``domain`` is currently unused: every host gets the same delay of
        ``crawl_delay`` seconds from now.
        """
        return int(time.time()) + self.crawl_delay

    def add(self, origin, urls=None):
        """Register URLs found on ``origin`` and return origin's wait time.

        Args:
            origin: URL of the page the links came from.
            urls: A set, list, tuple or frozenset of URLs, or a single URL.
                When empty or ``None``, ``origin`` itself is queued.

        Returns:
            The value of ``get_waittime`` computed for origin's host.
        """
        # Normalise a missing host name to '' exactly like the loop below,
        # so the host entry and the bucket key always agree.
        hostname = urlparse(origin).hostname or ''
        waittime = self.get_waittime(hostname)

        if not urls:
            urls = {origin}
        elif isinstance(urls, (list, tuple, frozenset)):
            # Previously these were wrapped whole in a one-element set, which
            # raised TypeError for lists and queued a bogus "URL" otherwise.
            urls = set(urls)
        elif not isinstance(urls, set):
            # Anything else is treated as a single URL.
            urls = {urls}

        new_urls = urls - self.urls

        if not self.ignore_url(origin):
            self.hosts.insert((waittime, hostname))

        for url in new_urls:
            if self.ignore_url(url):
                continue

            hostname = urlparse(url).hostname or ''
            self.buckets[hostname].add(url)
            if hostname not in self.hosts:
                # Hosts not yet tracked are inserted with wait time 0.
                self.hosts.insert((0, hostname))

        # Ignored URLs are recorded as seen too, so they are not re-examined.
        self.urls.update(new_urls)
        self.stats['URLs frontier'] = len(self.urls)
        self.stats['hostnames'] = len(self.hosts)
        return waittime

    def pop(self):
        """Return one pending URL, or ``None`` when ``find_le`` finds no host.

        Each host returned by ``hosts.find_le(now)`` is re-inserted with a
        fresh wait time before its bucket is consulted. A host whose bucket
        is empty is dropped from both ``buckets`` and ``hosts`` and the
        search continues.
        """
        url = None
        while url is None:
            try:
                _, hostname = self.hosts.find_le(time.time())
            except ValueError:
                # NOTE(review): presumably raised when no host is due yet.
                return None

            self.hosts.insert((self.get_waittime(hostname), hostname))

            try:
                url = self.buckets[hostname].pop()
            except KeyError:
                # set.pop() on an empty bucket: forget this host entirely.
                del self.buckets[hostname]
                self.hosts.remove((0, hostname))

        return url