Example #1
    def __init__(self, urls, normalizer, result_saver=print,
                 qsize=None, result_qsize=None, num_workers=1,
                 max_tries=4, timeout=3, max_connections=30, *, loop=None):
        """Async URLCleaner.

        :param normalizer: callable that takes a url and returns the
        normalized url, False when the url is invalid, or None when the url
        can't be validated.

        """
        self.urls = urls
        self.normalizer = normalizer
        self.result_saver = result_saver

        self.loop = loop or asyncio.get_event_loop()
        self.q = Queue(maxsize=qsize or num_workers * 10, loop=self.loop)
        self.result_q = Queue(maxsize=result_qsize or num_workers * 10,
                              loop=self.loop)

        self.num_workers = num_workers
        self.max_tries = max_tries
        self.timeout = timeout
        proxy = os.environ.get('http_proxy')
        if proxy:
            self.connector = aiohttp.ProxyConnector(proxy=proxy,
                                                    limit=max_connections,
                                                    loop=self.loop)
        else:
            self.connector = aiohttp.TCPConnector(limit=max_connections,
                                                  loop=self.loop)

        self.t0 = time.time()
        self.t1 = None
        self.clean_task = None
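For reference, a minimal sketch of a normalizer callable that satisfies the contract described in the docstring above (a hypothetical helper, not part of the original snippet):

def simple_normalizer(url):
    """Hypothetical normalizer: returns the normalized url, False for a
    clearly invalid url, or None when validity can't be decided locally."""
    from urllib.parse import urlsplit, urlunsplit
    url = url.strip()
    if not url:
        return False
    parts = urlsplit(url)
    if parts.scheme not in ('http', 'https'):
        return False
    if not parts.netloc:
        return None  # cannot be validated without a host
    return urlunsplit((parts.scheme.lower(), parts.netloc.lower(),
                       parts.path or '/', parts.query, ''))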
Example #2
class Fetcher:
    def __init__(self, loop):
        self.num_worker = 10
        self.loop = loop
        self.q = Queue()
        self.seen_urls = set(['/'])

    async def manager(self):
        workers = [
            self.loop.create_task(self.worker())
            for _ in range(self.num_worker)
        ]
        # the `yield from` is not needed
        await self.q.put('/')
        # wait until q is empty
        await self.q.join()
        for w in workers:
            w.cancel()

    async def worker(self):
        while True:
            url = await self.q.get()

            sock = socket.socket(socket.AF_INET)
            sock.setblocking(False)
            try:
                await self.loop.sock_connect(sock, ('dilbert.com', 80))
            except BlockingIOError:
                pass

            request = 'GET {} HTTP/1.1\r\nHost: dilbert.com\r\nConnection: close\r\n\r\n'.format(
                url)
            await self.loop.sock_sendall(sock, request.encode('ascii'))

            response = b''
            chunk = await self.loop.sock_recv(sock, 4096)
            while chunk:
                response += chunk
                chunk = await self.loop.sock_recv(sock, 4096)

            links = await self.parse_link(response)
            for link in links.difference(self.seen_urls):
                await self.q.put(link)

            self.seen_urls.update(links)
            self.q.task_done()
            sock.close()

    async def parse_link(self, response):
        links = set([])
        d = pq(response)
        anchors = d("a")
        for anchor in anchors:
            href = anchor.get("href")
            if href and href[:5] == "http:" and href[7:14] == "dilbert":
                links.add(href[6:])
        return links
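A hypothetical driver for the Fetcher above (the snippet assumes socket, asyncio and pyquery's pq are already imported in its module):

import asyncio

loop = asyncio.get_event_loop()
fetcher = Fetcher(loop)
loop.run_until_complete(fetcher.manager())  # crawl until the queue drains
loop.close()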
Example #3
 def __init__(self, roots,
              exclude=None, strict=True,  # What to crawl.
              max_redirect=10, max_tries=4,  # Per-url limits.
              max_tasks=10, *, loop=None):   # The lone * indicates that all following arguments are keyword-only arguments
     self.loop = loop or asyncio.get_event_loop()
     self.roots = roots
     self.exclude = exclude
     self.strict = strict
     self.max_redirect = max_redirect
     self.max_tries = max_tries
     self.max_tasks = max_tasks
     self.q = Queue(loop=self.loop)
     self.seen_urls = set()
     self.done = []
     self.session = aiohttp.ClientSession(loop=self.loop)
     self.root_domains = set()
     for root in roots:
         parts = urllib.parse.urlparse(root)
         host, port = urllib.parse.splitport(parts.netloc)
         if not host:
             continue
         if re.match(r'\A[\d\.]*\Z', host):  # \A and \Z are like ^ and $; [\d\.]* matches a dotted-digit host such as 0.0.0.0
             self.root_domains.add(host)
         else:
             host = host.lower()
             if self.strict:
                 self.root_domains.add(host)
             else:
                 self.root_domains.add(lenient_host(host))
     for root in roots:
         self.add_url(root)
     self.t0 = time.time()
     self.t1 = None
 def __init__(self, roots,
              exclude=None, strict=True,  # What to crawl.
              max_redirect=10, max_tries=4,  # Per-url limits.
              max_tasks=10, *, loop=None):
     self.loop = loop or asyncio.get_event_loop()
     self.roots = roots
     self.exclude = exclude
     self.strict = strict
     self.max_redirect = max_redirect
     self.max_tries = max_tries
     self.max_tasks = max_tasks
     self.q = Queue(loop=self.loop)
     self.seen_urls = set()
     self.done = []
     self.session = aiohttp.ClientSession(loop=self.loop)
     self.root_domains = set()
     for root in roots:
         parts = urllib.parse.urlparse(root)  # splits the URL into its six components (scheme://netloc/path;parameters?query#fragment); the result also exposes username, password, hostname and port attributes
         host, port = urllib.parse.splitport(parts.netloc)
         if not host:
             continue
         if re.match(r'\A[\d\.]*\Z', host):
             self.root_domains.add(host)  # bare-IP roots such as 192.168.3.4 are accessed directly
         else:
             host = host.lower()  # domain names are case-insensitive
             if self.strict:
                 self.root_domains.add(host)
             else:
                 self.root_domains.add(lenient_host(host))  # presumably meant to keep just the registrable domain; oddly, lenient_host() joins the parts without a dot
     for root in roots:
         self.add_url(root)
     self.t0 = time.time()
     self.t1 = None
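The lenient_host helper is never shown in these snippets; a definition consistent with how it is used here (and with the comment above about the missing dot) keeps only the last two host components and joins them without a separator:

def lenient_host(host):
    # 'www.example.com' -> 'examplecom'; used for loose same-site comparisons.
    parts = host.split('.')[-2:]
    return ''.join(parts)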
Example #5
    def __init__(self, root, max_tasks=1000, loop=None, file=None):
        LOGGER.info('Starting Crawler ...\n')
        self.loop = loop or asyncio.get_event_loop()
        self.q = Queue(loop=self.loop)
        self.visited_urls = set()
        self.max_tasks = max_tasks
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()

        parts = urllib.parse.urlparse(root)
        host, port = urllib.parse.splitport(parts.netloc)
        if re.match(r'\A[\d\.]*\Z', host):
            self.root_domains.add(host)
        else:
            host = host.lower()
            self.root_domains.add(host)

        print('Hosts : {}'.format(','.join(self.root_domains)))

        self.add_url(root)

        self.t0 = time.time()
        self.t1 = None
        filename = '{}.csv'.format(file)
        self.f = open(filename, 'w')
        self.csv = csv.writer(self.f)
        self.csv.writerow(CSV_HEADER)
Example #6
 def __init__(self, roots,
              exclude=None, strict=True,  # What to crawl.
              max_redirect=10, max_tries=4,  # Per-url limits.
              max_tasks=10, *, loop=None):
     self.loop = loop or asyncio.get_event_loop()
     self.roots = roots
     self.exclude = exclude
     self.strict = strict
     self.max_redirect = max_redirect
     self.max_tries = max_tries
     self.max_tasks = max_tasks
     self.q = Queue(loop=self.loop)
     self.seen_urls = set()
     self.done = []
     self.session = aiohttp.ClientSession(loop=self.loop)
     self.root_domains = set()
     for root in roots:
         parts = urllib.parse.urlparse(root)
         host, port = urllib.parse.splitport(parts.netloc)
         if not host:
             continue
         if re.match(r'\A[\d\.]*\Z', host):
             self.root_domains.add(host)
         else:
             host = host.lower()
             if self.strict:
                 self.root_domains.add(host)
             else:
                 self.root_domains.add(lenient_host(host))
     for root in roots:
         self.add_url(root)
     self.t0 = time.time()
     self.t1 = None
 def __init__(self, url, max_requests, loop, max_coroutines=100):
     self.url = url
     self.max_requests = max_requests
     self.links_visited = set()
     self.max_coroutines = max_coroutines
     self.queue = Queue()
     self.loop = loop
Example #8
 def __init__(self,
              targets=None,
              loop=None,
              configuration_reads=True,
              bus_timeout=2,
              iface=False,
              nat_mode=False):
     self.loop = loop or asyncio.get_event_loop()
     # q contains all KNXnet/IP gateways
     self.q = Queue(loop=self.loop)
     # bus_protocols is a list of all bus protocol instances for proper connection shutdown
     self.bus_protocols = []
     # knx_gateways is a list of KnxTargetReport objects, one for each found KNXnet/IP gateway
     self.knx_gateways = []
     self.t0 = time.time()
     self.t1 = None
     self.desc_timeout = None
     self.desc_retries = None
     self.knx_source = None
     self.configuration_reads = configuration_reads
     self.bus_timeout = bus_timeout
     self.iface = iface
     self.nat_mode = nat_mode
     if targets:
         self.set_targets(targets)
     else:
         self.targets = set()
Example #9
class Crawler:
    def __init__(self, root_url, max_redirect):
        self.max_tasks = 10
        self.max_redirect = max_redirect
        self.q = Queue()
        self.seen_urls = set()

        self.session = aiohttp.ClientSession(loop=loop)

        self.q.put_nowait((root_url, self.max_redirect))


    @asyncio.coroutine
    def crawl(self):
        workers = [asyncio.Task(self.work())
                   for _ in range(self.max_tasks)]
Example #10
    def __init__(
            self,
            roots,
            exclude=None,
            strict=True,  # What to crawl.
            max_redirect=10,
            max_tries=4,  # Per-url limits.
            max_tasks=10,
            *,
            loop=None):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.count = 0
        self.seen_urls = set()
        self.good_urls = set()
        # if os.path.exists('seenurls'):
        #     with open('seenurls', 'r') as f:
        #         for line in f:
        #             self.seen_urls.add(json.loads(line))
        self.done = []
        # if os.path.exists('done'):
        #     with open('done', 'r') as f:
        #         for line in f:
        #             data = json.loads(line)
        #             self.record_statistic(FetchStatistic(url=data[0],
        #                                                  next_url=data[1],
        #                                                  status=data[2],
        #                                                  exception=data[3],
        #                                                  size=data[4],
        #                                                  content_type=data[5],
        #                                                  encoding=data[6],
        #                                                  num_urls=data[7],
        #                                                  num_new_urls=data[8]))
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))

        for root in roots:
            self.add_url(root)
        self.t0 = time.time()
        self.t1 = None
        self.saving = True
Example #11
    def __init__(self,
                 loop=None,
                 max_tasks=None,
                 retries=None,
                 secure=None,
                 hosts=None,
                 params=None,
                 config=None,
                 config_file=None,
                 access=None,
                 secret=None,
                 debug=None):

        # Set default values for kwargs.
        loop = asyncio.get_event_loop() if not loop else loop
        max_tasks = 100 if not max_tasks else max_tasks
        max_retries = 10 if not retries else retries
        protocol = 'http://' if not secure else 'https://'
        config = get_config(config, config_file)
        access = config.get('s3', {}).get('access', access)
        secret = config.get('s3', {}).get('secret', secret)
        debug = True if debug else False

        self.max_tasks = max_tasks
        self.max_retries = max_retries
        self.protocol = protocol
        self.hosts = hosts
        self.config = config
        self.access = access
        self.debug = debug
        self.cookies = config.get('cookies', {})

        # Asyncio/Aiohttp settings.
        self.connector = aiohttp.TCPConnector(share_cookies=True, loop=loop)
        self.connector.update_cookies(self.cookies)
        self.loop = loop
        self.q = Queue(loop=self.loop)

        # Require valid access key!
        self.assert_s3_keys_valid(access, secret)

        # Rate limiting.
        self._max_per_second = self.get_global_rate_limit()
        self._min_interval = 1.0 / float(self._max_per_second)
        self._last_time_called = 0.0
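The last three attributes set up client-side rate limiting; a hypothetical throttle coroutine built on them (not part of the snippet) could look like this:

import asyncio
import time

@asyncio.coroutine
def _throttle(self):
    # Sleep just long enough to stay under the global requests-per-second cap.
    wait = self._min_interval - (time.time() - self._last_time_called)
    if wait > 0:
        yield from asyncio.sleep(wait)
    self._last_time_called = time.time()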
Example #12
class Crawler:
    def __init__(self, root_url, max_redirect):
        self.max_tasks = 10
        self.max_redirect = max_redirect
        self.q = Queue()
        self.seen_urls = set()

        self.session = aiohttp.ClientSession(loop=loop)

        self.q.put_nowait((root_url, self.max_redirect))

    @asyncio.coroutine
    def crawl(self):
        workers = [asyncio.Task(self.work()) for _ in range(self.max_tasks)]
        yield from self.q.join()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def work(self):
        while True:
            url, max_redirect = yield from self.q.get()
            yield from self.fetch(url, max_redirect)
            # multithreading discussion: https://segmentfault.com/q/1010000009765115
            self.q.task_done()

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        response = yield from self.session.get(url, allow_redirects=False)

        try:
            if is_redirect(response):
                if max_redirect > 0:
                    next_url = response.headers['location']
                    if next_url in self.seen_urls:
                        return
                    self.seen_urls.add(next_url)
                    self.q.put_nowait((next_url, max_redirect - 1))
            else:
                links = yield from self.parse_links(response)
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            yield from response.release()
Example #13
    def __init__(self, root_url, max_redirect):
        self.max_tasks = 10
        self.max_redirect = max_redirect
        self.q = Queue()
        self.seen_urls = set()

        self.session = aiohttp.ClientSession(loop=loop)

        self.q.put_nowait((root_url, self.max_redirect))
Example #14
    def __init__(self, root_url, max_redirect):
        self.max_tasks = 10
        self.max_redirect = max_redirect
        self.q = Queue()
        self.seen_urls = set()

        self.session = aiohttp.ClientSession(loop=loop)

        self.q.put_nowait((root_url, self.max_redirect))
Example #15
 def __init__(self, max_tries=30, max_tasks=10, timeout=5,
              rootDir=os.getcwd()):
     self.max_tries = max_tries
     self.max_tasks = max_tasks
     self.loop = asyncio.get_event_loop()
     self.q = Queue(loop=self.loop)
     self.session = aiohttp.ClientSession(loop=self.loop)
     self.timeout = timeout
     self.rootDir = rootDir
Example #16
    def __init__(self, max_tasks=20, max_redirect=10):
        self.max_tasks = max_tasks
        self.max_redirect = max_redirect
        self.q = Queue()

        # aiohttp's ClientSession does connection pooling and
        # HTTP keep-alives for us.
        loop = asyncio.get_event_loop()
        self.session = aiohttp.ClientSession(loop=loop)
        loop.run_until_complete(self.fetch())
Example #17
 def __init__(
         self,
         roots,
         exclude=None,
         strict=True,  # What to crawl.
         max_redirect=10,
         max_tries=4,  # Per-url limits.
         max_tasks=15,
         scrape_nonhtml=False,
         *,
         loop=None):
     self.loop = loop or asyncio.get_event_loop()
     self.roots = roots
     self.exclude = exclude
     self.strict = strict
     self.max_redirect = max_redirect
     self.max_tries = max_tries
     self.max_tasks = max_tasks
     self.request_timeout = 15
     self.q = Queue(loop=self.loop)
     self.seen_urls = set()
     self.done = []
     self.ua = fake_useragent.UserAgent()
     self.session = aiohttp.ClientSession(loop=self.loop)
     self.root_domains = set()
     self.scrape_nonhtml = scrape_nonhtml
     self.dbdsn = 'dbname=osint user=postgres host=127.0.0.1'
     self.dbinsertquery = 'INSERT INTO public.rawhtml' \
                          '(hostreversed, port, path, query, ctype, cdata, ctimestamp) ' \
                          'VALUES (%(hostreversed)s, %(port)s, %(path)s, %(query)s, ' \
                          '%(ctype)s, %(cdata)s, %(ctimestamp)s)'
     self.dbroots = dict()
     self.dnsroots = dict()
     for root in roots:
         parts = urllib.parse.urlparse(root)
         host, port = urllib.parse.splitport(parts.netloc)
         if not host:
             continue
         if re.match(r'\A[\d\.]*\Z', host):
             self.root_domains.add(host)
             self.dnsroots[host] = host
             self.dbroots[host] = host  # TODO: get human readable form from DNS server
         else:
             host = host.lower()
             if self.strict:
                 self.root_domains.add(host)
                 self.dbroots[host] = '.'.join(reversed(host.split('.')))
             else:
                 self.root_domains.add(lenient_host(host))
     for root in roots:
         self.add_url(root)
     self.dbpool = None
     self.t0 = time.time()
     self.t1 = None
Example #18
    def __init__(self, root_url: str, max_redirect: int):
        self.max_tasks = 10
        self.max_redirect = max_redirect
        self.q = Queue()
        self.seen_urls = set()

        # aiohttp's ClientSession does connection pooling and HTTP keep-alives for us.
        self.session = aiohttp.ClientSession(loop=loop)

        # Put (URL, max_redirect) in the queue.
        self.q.put_nowait((root_url, self.max_redirect))
Example #19
def worker(get, queue: asyncio.JoinableQueue, output):
    while True:
        item = yield from queue.get()
        # This is horrible and I feel bad for writing it, believe me
        try:
            if item is None:
                return

            chunks, id = item

            for i in range(id, id + chunks):
                try:
                    data = yield from get("item/{}".format(i))
                    output(data)
                except Exception:
                    pass
        except Exception as e:
            pass
        finally:
            queue.task_done()
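A hypothetical way to drive the worker above, using the None sentinel it checks for (asyncio.JoinableQueue is the pre-3.4.4 name; later versions fold join()/task_done() into asyncio.Queue):

import asyncio

@asyncio.coroutine
def run_workers(get, output, n_workers=4):
    queue = asyncio.JoinableQueue()
    tasks = [asyncio.Task(worker(get, queue, output)) for _ in range(n_workers)]
    yield from queue.put((100, 1))   # one chunk: fetch items 1..100
    yield from queue.join()          # wait until the chunk is processed
    for _ in tasks:
        yield from queue.put(None)   # sentinel: tell each worker to return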
Example #20
    def __init__(self, roots,
                 exclude=None, strict=True,  # What to crawl.
                 max_redirect=10, max_tries=4,  # Per-url limits.
                 max_tasks=10, *, loop=None):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.count = 0
        self.seen_urls = set()
        self.good_urls = set()
        # if os.path.exists('seenurls'):
        #     with open('seenurls', 'r') as f:
        #         for line in f:
        #             self.seen_urls.add(json.loads(line))
        self.done = []
        # if os.path.exists('done'):
        #     with open('done', 'r') as f:
        #         for line in f:
        #             data = json.loads(line)
        #             self.record_statistic(FetchStatistic(url=data[0],
        #                                                  next_url=data[1],
        #                                                  status=data[2],
        #                                                  exception=data[3],
        #                                                  size=data[4],
        #                                                  content_type=data[5],
        #                                                  encoding=data[6],
        #                                                  num_urls=data[7],
        #                                                  num_new_urls=data[8]))
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))

        for root in roots:
            self.add_url(root)
        self.t0 = time.time()
        self.t1 = None
        self.saving = True
Example #21
    def __init__(self, root_url, max_redirect):
        self.max_tasks = 10
        self.max_redirect = max_redirect
        self.q = Queue()
        self.seen_urls = set()

        # aiohttp's ClientSession does connection pooling and
        # HTTP keep-alives for us.
        self.session = aiohttp.ClientSession(loop=loop)

        # Put (URL, max_redirect) in the queue.
        self.q.put_nowait((root_url, self.max_redirect))

    @asyncio.coroutine
    def crawl(self):
        """Run the crawler until all work is done."""
        workers = [asyncio.Task(self.work())
                   for _ in range(self.max_tasks)]

        # When all work is done, exit.
        yield from self.q.join()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def work(self):
        while True:
            url, max_redirect = yield from self.q.get()

            # Download page and add new links to self.q.
            yield from self.fetch(url, max_redirect)
            self.q.task_done()

# Begin fetching http://xkcd.com/353/
fetcher = Fetcher('/353/')
Task(fetcher.fetch())

loop = asyncio.get_event_loop()

crawler = crawling.Crawler('http://xkcd.com',
                           max_redirect=10)

loop.run_until_complete(crawler.crawl())
Example #22
 def __init__(self,
              max_tries=30,
              max_tasks=10,
              timeout=5,
              rootDir=os.getcwd()):
     self.max_tries = max_tries
     self.max_tasks = max_tasks
     self.loop = asyncio.get_event_loop()
     self.q = Queue(loop=self.loop)
     self.session = aiohttp.ClientSession(loop=self.loop)
     self.timeout = timeout
     self.rootDir = rootDir
class WebCrawler(object):
    def __init__(self, url, max_requests, loop, max_coroutines=100):
        self.url = url
        self.max_requests = max_requests
        self.links_visited = set()
        self.max_coroutines = max_coroutines
        self.queue = Queue()
        self.loop = loop

    @asyncio.coroutine
    def work(self):
        while True:
            url = yield from self.queue.get()
            fetcher = Fetcher(url, self)
            yield from fetcher.connect()
            self.queue.task_done()

    @asyncio.coroutine
    def web_crawler(self):
        self.queue.put_nowait(self.url)
        self.session = aiohttp.ClientSession(loop=self.loop)
        workers = [
            asyncio.Task(self.work()) for _ in range(self.max_coroutines)
        ]
        yield from self.queue.join()
        for worker in workers:
            worker.cancel()
        yield from self.session.close()
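A hypothetical way to run the WebCrawler above (it also relies on a Fetcher class that is not shown in this snippet):

import asyncio

loop = asyncio.get_event_loop()
crawler = WebCrawler('http://example.com', max_requests=100, loop=loop)
loop.run_until_complete(crawler.web_crawler())
loop.close()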
Example #24
    def __init__(self, roots,
                 exclude=None, strict=True,  # What to crawl.
                 max_redirect=10, max_tries=4,  # Per-url limits.
                 max_tasks=10, loop=None):

        get_domain(roots)

        if not os.path.exists(os.path.dirname(path)):
            os.makedirs(os.path.dirname(path))

        with open(path, 'w') as temp_file:
            print('writing')
            temp_file.write('Domain name:')
            temp_file.write(roots)
            temp_file.write('\n \n')
            temp_file.close()

        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.connector = aiohttp.TCPConnector(loop=self.loop)
        self.root_domains = set()
#        for root in roots:
#            parts = urllib.parse.urlparse(root)
#            host, port = urllib.parse.splitport(parts.netloc)
#            if not host:
#                continue
#            if re.match(r'\A[\d\.]*\Z', host):
#                self.root_domains.add(host)
#            else:
##                host = host.lower()
#                if self.strict:
#                    self.root_domains.add(host)
#                else:
#                    self.root_domains.add(lenient_host(host))
#        for root in roots:
#            print("true root")
#            print(root)
#            self.add_url(root)
        self.add_url(roots)
        self.t0 = time.time()
        self.t1 = None
 def __init__(self, task):
     self.seen_url = set()
     self.max_tasks = 50
     self.max_retry = 10
     self.loop = asyncio.get_event_loop()
     self.session = aiohttp.ClientSession(loop=self.loop)
     self.q = Queue(loop = self.loop)
     
     self.manager = Manager()
     
     self.q.put_nowait(task)
     
     """
     for debug
     """
     self.item_cnt = 0
     self.page_cnt = 0
     self.f = open("D:\\Acer", 'w')
     print('initialization finished.')
Example #26
    def __init__(self, loop, pre_url=None, pre_payload=None, target_url=None,
                 login=None, payload_model=None, wordlist=None,
                 error_string=None, tasks=64, tor=None, tor_address=None,
                 debug=None):
        self.max_tasks = tasks
        self.queue = Queue()
        self.pre_url = pre_url
        self.pre_payload = self._generate_payload_type(pre_payload)
        self.attack_url = target_url
        self.login = login
        self.error_string = [x.strip() for x in error_string.split(',')]
        self.payload = self._generate_payload_type(payload_model)
        self.wordlist = wordlist
        self.found = Event()
        self.tor_use = tor
        #self.session = self._generate_new_session(loop)
        self.debug = debug
        self.runned_passwords = set()
        self.old_passwds = set()
        self.restore_files = []
        self.progress_bar = None
        self.ua = self._prepare_user_agents()
        self.start_time = time.time()
        self.last_report_time = time.time()

        # Statuses set of settings
        self.loaded_passwords = 0
        self.tried_passwords = 0
        self.error_passwords = 0
        self.max_passwords = 0

        # Tor set of settings
        if self.tor_use is not None and tor_address is not None:
            ip, port = parse_proxy_address(tor_address)
            self.tor_address = "http://{0}:{1}".format(ip, port)
            self.tor_address_string = tor_address

        # Session set of settings
        self.session_name = self._generate_session_name()
        restore_files = self._search_open_sesssion()
        if restore_files > 0:
            for file in self.restore_files:
                if self._load_old_session(file) is True:
                    break
        else:
            pass
Example #27
    def __init__(self,
                 start_date,
                 end_date,
                 max_tasks=10,
                 max_tries=10,
                 loop=None):
        self.start_date = start_date
        self.end_date = end_date
        self.max_tasks = max_tasks
        self.max_tries = max_tries

        self.loop = loop or asyncio.get_event_loop()
        self.session = aiohttp.ClientSession(loop=self.loop)

        self.q = Queue(loop=self.loop)

        self.t0 = time.time()
        self.t1 = None

        self.make_url_queue()
Example #28
    def __init__(self,
                 loop,
                 root_urls,
                 url_filter,
                 max_tries=4,
                 max_redirects=10,
                 sleep_interval=0):
        self._loop = loop
        self._max_tries = max_tries
        self._max_redirects = max_redirects
        self._sleep_interval = sleep_interval

        self._session = None
        self._url_filter = url_filter

        # get queue ready
        self._url_queue = Queue(loop=loop)
        # add root URLs to URL queue
        for url in root_urls:
            self.add_a_task(url, 0, 0)
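add_a_task is not part of the snippet; a sketch consistent with how it is called above (a URL plus two counters, presumably tries and redirects so far) might simply enqueue the triple:

def add_a_task(self, url, n_tries, n_redirects):
    # Hypothetical helper: queue a (url, tries-so-far, redirects-so-far) work item.
    self._url_queue.put_nowait((url, n_tries, n_redirects))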
Example #29
 def __init__(self, roots,
              # What to crawl.
              exclude=None, include=None, output=None, strict=True, count=None,
              proxy=None, max_redirect=10, max_tries=4,  # Per-url limits.
              max_tasks=10, loop=None, no_parse_links=False):
     self.loop = loop or asyncio.get_event_loop()
     self.roots = roots
     self.exclude = exclude
     self.include = include
     self.output = output
     self.count = int(count) if count else None
     self.strict = strict
     self.proxy = proxy
     self.max_redirect = max_redirect
     self.max_tries = max_tries
     self.max_tasks = max_tasks
     self.task_exit_counter = 0
     self.q = Queue(loop=self.loop)
     self.seen_urls = set()
     self.done = []
     self.session = aiohttp.ClientSession(loop=self.loop)
     self.root_domains = set()
     self.no_parse_links = no_parse_links
     for root in roots:
         parts = urllib.parse.urlparse(root)
         host, port = urllib.parse.splitport(parts.netloc)
         if not host:
             continue
         if re.match(r'\A[\d\.]*\Z', host):
             self.root_domains.add(host)
         else:
             host = host.lower()
             if self.strict:
                 self.root_domains.add(host)
             else:
                 self.root_domains.add(lenient_host(host))
     for root in roots:
         self.add_url(root)
     self.t0 = time.time()
     self.t1 = None
     self.output_file = self.get_file()
Example #30
 def __init__(
         self,
         roots,
         exclude=None,
         strict=True,  # What to crawl.
         max_redirect=10,
         max_tries=4,  # Per-url limits.
         max_tasks=10,
         *,
         loop=None):
     self.loop = loop or asyncio.get_event_loop()
     self.roots = roots
     self.exclude = exclude
     self.strict = strict
     self.max_redirect = max_redirect
     self.max_tries = max_tries
     self.max_tasks = max_tasks
     self.q = Queue(loop=self.loop)  # work queue; URLs are put here for the workers to crawl
     self.seen_urls = set()
     self.done = []  # finished fetches; each element is a FetchStatistic namedtuple
     self.session = aiohttp.ClientSession(loop=self.loop)  # all I/O runs on the single event-loop thread
     self.root_domains = set()
     for root in roots:
         parts = urllib.parse.urlparse(
             root)  # return 6 parts includes netloc(host+port)
         host, port = urllib.parse.splitport(
             parts.netloc)  # www.baidu.com, 80
         if not host:
             continue
         if re.match(r'\A[\d\.]*\Z', host):  # host is just digits and dots, i.e. an IP address
             self.root_domains.add(host)
         else:
             host = host.lower()
             if self.strict:
                 self.root_domains.add(host)
             else:  # omit the 'www.' prefix
                 self.root_domains.add(lenient_host(host))
     for root in roots:
         self.add_url(root)  # add url to seen_urls set
     self.t0 = time.time()  # start time
     self.t1 = None  # end time
Example #31
 def __init__(self, targets=None, max_workers=100, loop=None):
     self.loop = loop or asyncio.get_event_loop()
     # The number of concurrent workers for discovering KNXnet/IP gateways
     self.max_workers = max_workers
     # q contains all KNXnet/IP gateways
     self.q = Queue(loop=self.loop)
     # bus_queues is a dict containing a bus queue for each KNXnet/IP gateway
     self.bus_queues = dict()
     # bus_protocols is a list of all bus protocol instances for proper connection shutdown
     self.bus_protocols = list()
     # knx_gateways is a list of KnxTargetReport objects, one for each found KNXnet/IP gateway
     self.knx_gateways = list()
     # bus_devices is a list of KnxBusTargetReport objects, one for each found bus device
     self.bus_devices = set()
     self.bus_info = False
     self.t0 = time.time()
     self.t1 = None
     if targets:
         self.set_targets(targets)
     else:
         self.targets = set()
Example #32
 def __init__(self, targets=None, max_workers=100, loop=None):
     self.loop = loop or asyncio.get_event_loop()
     # The number of concurrent workers for discovering KNXnet/IP gateways
     self.max_workers = max_workers
     # q contains all KNXnet/IP gateways
     self.q = Queue(loop=self.loop)
     # bus_queues is a dict containing a bus queue for each KNXnet/IP gateway
     self.bus_queues = dict()
     # bus_protocols is a list of all bus protocol instances for proper connection shutdown
     self.bus_protocols = list()
     # knx_gateways is a list of KnxTargetReport objects, one for each found KNXnet/IP gateway
     self.knx_gateways = list()
     # bus_devices is a list of KnxBusTargetReport objects, one for each found bus device
     self.bus_devices = set()
     self.bus_info = False
     self.t0 = time.time()
     self.t1 = None
     if targets:
         self.set_targets(targets)
     else:
         self.targets = set()
Example #33
 def __init__(
     self,
     roots,
     exclude=None,
     strict=True,  # What to crawl.
     max_redirect=10,
     max_tries=4,  # Per-url limits.
     max_tasks=10,
     *,
     loop=None
 ):
     self.loop = loop or asyncio.get_event_loop()
     self.roots = roots
     self.exclude = exclude
     self.strict = strict
     self.max_redirect = max_redirect
     self.max_tries = max_tries
     self.max_tasks = max_tasks
     self.q = Queue(loop=self.loop)
     self.seen_urls = set()
     self.done = []
     self.connector = aiohttp.TCPConnector(loop=self.loop)
     self.root_domains = set()
     for root in roots:
         parts = urllib.parse.urlparse(root)
         host, port = urllib.parse.splitport(parts.netloc)
         if not host:
             continue
         if re.match(r"\A[\d\.]*\Z", host):
             self.root_domains.add(host)
         else:
             host = host.lower()
             if self.strict:
                 self.root_domains.add(host)
             else:
                 self.root_domains.add(lenient_host(host))
     for root in roots:
         self.add_url(root)
     self.t0 = time.time()
     self.t1 = None
Example #34
	def __init__(self, roots, exclude = None, 
				strict = True, max_redirect = 10, 
				max_tries = 4, max_tasks = 10, * ,loop = None):
		self.loop = loop or asyncio.get_event_loop()
		self.roots = roots
		self.exclude = exclude
		self.strict = strict
		self.max_redirect = max_redirect
		self.max_tries = max_tries
		self.max_tasks = max_tasks
		self.q = Queue(loop = self.loop)
		self.seen_urls = set()
		self.done = []
		self.session = aiohttp.ClientSession(loop=self.loop)
		self.root_domains = set()
		print ('{}'.format(roots))
		for root in roots:
			parts = urllib.parse.urlparse(root)
			host, port = urllib.parse.splitport(parts.netloc)
			print ('host: %s, port %s'%(host,port))
			if not host:
				continue
			if re.match(r'\A[\d\.]*\Z', host): #match IP address
				self.root_domains.add(host)
			else: 
				host = host.lower()
				if self.strict:
					print ('no handled!')
					self.root_domains.add(host)
				else:
					print ('handled!')
					self.root_domains.add(lenient_host(host))

		for root in roots:
			self.add_url(root)
		self.t0 = time.time()
		self.t1 = None
Example #35
class Fetcher(object):
    """
    Async page fetcher that seeds its queue from redis and crawls concurrently.
    """

    def __init__(self, max_tasks=20, max_redirect=10):
        self.max_tasks = max_tasks
        self.max_redirect = max_redirect
        self.q = Queue()

        # aiohttp's ClientSession does connection pooling and
        # HTTP keep-alives for us.
        loop = asyncio.get_event_loop()
        self.session = aiohttp.ClientSession(loop=loop)
        loop.run_until_complete(self.fetch())


    @asyncio.coroutine
    def fetch(self):
        """
        Run the fetcher until all work is done.
        """
        # Create workers that fetch pages
        workers = [asyncio.Task(self.work())
                   for _ in range(self.max_tasks // 2)]

        # Create seeders that take URLs from redis and add them to our own queue
        seeders = [asyncio.Task(self.get_seeds())
                   for _ in range(self.max_tasks // 2)]

        # When all work is done, exit.
        yield from self.q.join()
        for s in seeders:
            s.cancel()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def work(self):
        while True:
            # Get URLs from own queue
            url = yield from self.q.get()

            # Download page
            yield from self.fetch_url(url)
            self.q.task_done()

    @asyncio.coroutine
    def fetch_url(self, url):
        # Handle redirects ourselves.
        response = yield from self.session.get(
            url, allow_redirects=True)

        try:
            # Handle the response
            pass
        finally:
            # Return connection to pool.
            yield from response.release()


    @asyncio.coroutine
    def get_seeds(self):
        while True:
            pass
Example #36
class Crawler:
    """Crawl a set of URLs.

    This manages two sets of URLs: 'urls' and 'done'.  'urls' is a set of
    URLs seen, and 'done' is a list of FetchStatistics.
    """
    def __init__(self, roots,
                 exclude=None, strict=True,  # What to crawl.
                 max_redirect=10, max_tries=4,  # Per-url limits.
                 max_tasks=10, *, loop=None):   # The lone * indicates that all following arguments are keyword-only arguments
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):  # \A and \Z are like ^ and $; [\d\.]* matches a dotted-digit host such as 0.0.0.0
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_url(root)
        self.t0 = time.time()
        self.t1 = None

    def close(self):
        """Close resources."""
        self.session.close()

    def host_okay(self, host):
        """Check if a host should be crawled.

        A literal match (after lowercasing) is always good.  For hosts
        that don't look like IP addresses, some approximate matches
        are okay depending on the strict flag.
        """
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)
        else:
            return self._host_okay_lenient(host)

    def _host_okay_strictish(self, host):
        """Check if a host should be crawled, strict-ish version.

        This checks for equality modulo an initial 'www.' component.
        """
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        """Check if a host should be crawled, lenient version.

        This compares the last two components of the host.
        """
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        """Record the FetchStatistic for completed / failed URL."""
        self.done.append(fetch_statistic)

    @asyncio.coroutine
    def parse_links(self, response):
        """Return a FetchStatistic and list of links."""
        links = set()
        content_type = None
        encoding = None
        body = yield from response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = yield from response.text()

                urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''',
                                      text))
                if urls:
                    LOGGER.info('got %r distinct urls from %r',
                                len(urls), response.url)
                for url in urls:
                    normalized = urllib.parse.urljoin(response.url, url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

        stat = FetchStatistic(
            url=response.url,
            next_url=None,
            status=response.status,
            exception=None,
            size=len(body),
            content_type=content_type,
            encoding=encoding,
            num_urls=len(links),
            num_new_urls=len(links - self.seen_urls))

        return stat, links

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        """Fetch one URL."""
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = yield from self.session.get(
                    url, allow_redirects=False)

                if tries > 1:
                    LOGGER.info('try %r for %r success', tries, url)

                break
            except aiohttp.ClientError as client_error:
                LOGGER.info('try %r for %r raised %r', tries, url, client_error)
                exception = client_error

            tries += 1
        else:
            # We never broke out of the loop: all tries failed.
            LOGGER.error('%r failed after %r tries',
                         url, self.max_tries)
            self.record_statistic(FetchStatistic(url=url,
                                                 next_url=None,
                                                 status=None,
                                                 exception=exception,
                                                 size=0,
                                                 content_type=None,
                                                 encoding=None,
                                                 num_urls=0,
                                                 num_new_urls=0))
            return

        try:
            if is_redirect(response):
                location = response.headers['location']
                next_url = urllib.parse.urljoin(url, location)
                self.record_statistic(FetchStatistic(url=url,
                                                     next_url=next_url,
                                                     status=response.status,
                                                     exception=None,
                                                     size=0,
                                                     content_type=None,
                                                     encoding=None,
                                                     num_urls=0,
                                                     num_new_urls=0))

                if next_url in self.seen_urls:
                    return
                if max_redirect > 0:
                    LOGGER.info('redirect to %r from %r', next_url, url)
                    self.add_url(next_url, max_redirect - 1)
                else:
                    LOGGER.error('redirect limit reached for %r from %r',
                                 next_url, url)
            else:
                stat, links = yield from self.parse_links(response)
                self.record_statistic(stat)
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            yield from response.release()

    @asyncio.coroutine
    def work(self):
        """Process queue items forever."""
        try:
            while True:
                url, max_redirect = yield from self.q.get()
                assert url in self.seen_urls
                yield from self.fetch(url, max_redirect)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    def url_allowed(self, url):
        if self.exclude and re.search(self.exclude, url):
            return False
        parts = urllib.parse.urlparse(url)
        if parts.scheme not in ('http', 'https'):
            LOGGER.debug('skipping non-http scheme in %r', url)
            return False
        host, port = urllib.parse.splitport(parts.netloc)
        if not self.host_okay(host):
            LOGGER.debug('skipping non-root host in %r', url)
            return False
        return True

    def add_url(self, url, max_redirect=None):
        """Add a URL to the queue if not seen before."""
        if max_redirect is None:
            max_redirect = self.max_redirect
        LOGGER.debug('adding %r %r', url, max_redirect)
        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect))

    @asyncio.coroutine
    def crawl(self):
        """Run the crawler until all finished."""
        workers = [asyncio.Task(self.work(), loop=self.loop)
                   for _ in range(self.max_tasks)]
        self.t0 = time.time()
        yield from self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()
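Several snippets above build FetchStatistic records without showing its definition; a definition consistent with the fields used here (as in the well-known asyncio crawler example these snippets derive from) is:

from collections import namedtuple

FetchStatistic = namedtuple('FetchStatistic',
                            ['url', 'next_url', 'status', 'exception', 'size',
                             'content_type', 'encoding', 'num_urls',
                             'num_new_urls'])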
Example #37
class URLCleaner:
    """Preprocess and clean urls."""
    def __init__(self, urls, normalizer, result_saver=print,
                 qsize=None, result_qsize=None, num_workers=1,
                 max_tries=4, timeout=3, max_connections=30, *, loop=None):
        """Async URLCleaner.

        :param normalizer: callable that takes a url and returns the
        normalized url, False when the url is invalid, or None when the url
        can't be validated.

        """
        self.urls = urls
        self.normalizer = normalizer
        self.result_saver = result_saver

        self.loop = loop or asyncio.get_event_loop()
        self.q = Queue(maxsize=qsize or num_workers * 10, loop=self.loop)
        self.result_q = Queue(maxsize=result_qsize or num_workers * 10,
                              loop=self.loop)

        self.num_workers = num_workers
        self.max_tries = max_tries
        self.timeout = timeout
        proxy = os.environ.get('http_proxy')
        if proxy:
            self.connector = aiohttp.ProxyConnector(proxy=proxy,
                                                    limit=max_connections,
                                                    loop=self.loop)
        else:
            self.connector = aiohttp.TCPConnector(limit=max_connections,
                                                  loop=self.loop)

        self.t0 = time.time()
        self.t1 = None
        self.clean_task = None

    def local_clean(self, url):
        local_clean_url = self.normalizer(url)
        if local_clean_url:
            status = 'LOCAL_OK'
        elif local_clean_url is False:
            status = 'LOCAL_INVALID'
            local_clean_url = None
        else:
            status = 'UNCLEANED'
        return URLStat(url=url, local_clean_url=local_clean_url,
                       remote_clean_url=None, status=status, http_code=None,
                       exception=None)

    @asyncio.coroutine
    def remote_clean(self, urlstat):
        """Check URL by HEAD probing it."""
        tries = 0
        exception = None
        url = urlstat.local_clean_url
        headers = {
            'Accept-Encoding': 'identity',
        }
        while tries < self.max_tries:
            try:
                response = yield from asyncio.wait_for(
                    aiohttp.request('head', url, allow_redirects=True,
                                    headers=headers,
                                    connector=self.connector, loop=self.loop),
                    self.timeout, loop=self.loop)
                response.close()

                if tries > 1:
                    logger.info('Try %r for %r success', tries, url)
                break

            except ValueError as error:
                # do not need to retry for these errors
                logger.info('For %r raised %s', url, error)
                tries = self.max_tries
                exception = error

            except aiohttp.HttpProcessingError as e:
                logger.error('Got http error for %r, exception %s', url, e)
                urlstat.http_code = e.code
                urlstat.status = 'REMOTE_ERROR'
                urlstat.exception = e
                return urlstat

            except (aiohttp.ClientError, asyncio.TimeoutError) as error:
                logger.info('Try %r for %r raised %s, %s', tries, url,
                            type(error), error)
                exception = error

            tries += 1
            yield from asyncio.sleep(0.1)
        else:
            # all tries failed
            logger.error('all tries for %r failed, exception %s', url,
                         exception)
            urlstat.status = 'REMOTE_ERROR'
            urlstat.exception = exception
            return urlstat

        urlstat.http_code = response.status

        if response.status == 200:
            remote_clean_url = self.normalizer(response.url)
            if remote_clean_url:
                urlstat.status = 'REMOTE_OK'
                urlstat.remote_clean_url = remote_clean_url
            elif remote_clean_url is False:
                urlstat.status = 'REMOTE_INVALID'
            else:
                # url requires authorization, can't clean
                urlstat.status = 'UNCLEANED'
        else:
            urlstat.status = 'REMOTE_INVALID'

        return urlstat

    @asyncio.coroutine
    def process_url(self, url):
        urlstat = self.local_clean(url)
        if urlstat.status == 'LOCAL_OK':
            urlstat = yield from self.remote_clean(urlstat)
        return urlstat

    def close(self):
        """Close resources."""
        self.connector.close()

    @asyncio.coroutine
    def save_results(self):
        """Save cleaned URLStat."""
        while True:
            urlstat = yield from self.result_q.get()
            try:
                self.result_saver(urlstat)
            except StopIteration:
                self.cancel()

            except Exception as e: # noqa
                logger.exception(e)

            self.result_q.task_done()

    @asyncio.coroutine
    def work(self):
        """Process queue items forever."""
        while True:
            url = yield from self.q.get()
            urlstat = yield from self.process_url(url)
            self.q.task_done()
            yield from self.result_q.put(urlstat)

    @asyncio.coroutine
    def _clean(self):
        try:
            self.consumer = asyncio.Task(self.save_results(), loop=self.loop)
            self.workers = [asyncio.Task(self.work(), loop=self.loop) for _ in
                            range(self.num_workers)]
            self.t0 = time.time()

            for url in self.urls:
                yield from self.q.put(url)

            yield from self.q.join()
            yield from self.result_q.join()

            self.t1 = time.time()
            logger.debug('Cleaning time %.2f seconds', self.t1 - self.t0)
            self.cancel()

        finally:
            self.close()

    def clean(self):
        """Run the cleaner until all finished."""
        self.clean_task = asyncio.async(self._clean(), loop=self.loop)
        return self.clean_task

    def cancel(self):
        self.consumer.cancel()
        for w in self.workers:
            w.cancel()

        self.clean_task.cancel()
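A hypothetical usage of the URLCleaner above, with a trivial normalizer and the default result_saver (print); the cleaner cancels its own task when finished, so a possible CancelledError is swallowed:

import asyncio

def passthrough_normalizer(url):
    # Hypothetical: accept any http(s) URL unchanged, reject everything else.
    return url if url.startswith(('http://', 'https://')) else False

cleaner = URLCleaner(['http://example.com/'], passthrough_normalizer)
loop = asyncio.get_event_loop()
try:
    loop.run_until_complete(cleaner.clean())
except asyncio.CancelledError:
    pass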
Example #38
class Spider:
    def __init__(self,
                 max_tries=30,
                 max_tasks=10,
                 timeout=5,
                 rootDir=os.getcwd()):
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.loop = asyncio.get_event_loop()
        self.q = Queue(loop=self.loop)
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.timeout = timeout
        self.rootDir = rootDir

    def close(self):
        self.session.close()

    def append_request(self, request):
        self.q.put_nowait(request)

    @asyncio.coroutine
    def _get_request(self):
        r = yield from self.q.get()
        return r

    @asyncio.coroutine
    def fetch(self, request_type, url, params, data):
        """Fetch one URL"""
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                print("try %s---->%d times" % (url, tries))
                with aiohttp.Timeout(self.timeout):
                    response = yield from self.session.get(url, params=params)
                    if response.status == 200:
                        content_type = response.headers.get('content-type')
                        if content_type in CONTENT_TYPE_TEXT:
                            with aiohttp.Timeout(self.timeout):
                                content = yield from response.text(
                                    encoding='GBK')
                        else:
                            with aiohttp.Timeout(self.timeout):
                                content = yield from response.read()
                    break
            except asyncio.TimeoutError:
                print("timeout")
            except aiohttp.ClientError as client_error:
                print("client error")
            except Exception:
                print("unknown error")
            tries += 1
        else:
            print("try %s---->more than %d times, quit" % (url, tries))
            return None

        response.release()
        return content

    @asyncio.coroutine
    def _work(self):
        """Process queue items forever."""
        try:
            while True:
                r = yield from self._get_request()
                content = yield from self.fetch(r.request_type, r.url,
                                                r.params, r.data)
                if (content):
                    r.handle_func(content)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    @asyncio.coroutine
    def work(self):
        yield from self._work()

    @asyncio.coroutine
    def spider(self):
        """run  the spider until all finished"""
        workers = [
            asyncio.Task(self.work(), loop=self.loop)
            for _ in range(self.max_tasks)
        ]
        yield from self.q.join()

        for w in workers:
            w.cancel()
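The Spider expects request objects exposing request_type, url, params, data and handle_func attributes; a hypothetical record type and driver:

import asyncio
from collections import namedtuple

Request = namedtuple('Request', 'request_type url params data handle_func')

spider = Spider()
spider.append_request(Request('GET', 'http://example.com', None, None, print))
spider.loop.run_until_complete(spider.spider())
spider.close()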
Example #39
 def __init__(self, loop):
     self.num_worker = 10
     self.loop = loop
     self.q = Queue()
     self.seen_urls = set(['/'])
Example #40
0
class Crawler:
    def __init__(self, roots, exclude=None, strict=True, max_redirect=10, max_tries=4, max_tasks=10, *, loop=None):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_url(root)
        self.t0 = time.time()
        self.t1 = None

    def close(self):
        self.session.close()

    def host_okay(self, host):
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)
        else:
            return self._host_okay_lenient(host)

    def _host_okay_strictish(self, host):
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        self.done.append(fetch_statistic)

    @asyncio.coroutine
    def parse_links(self, response):
        links = set()
        content_type = None
        encoding = None
        body = yield from response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = yield from response.text()

                urls = set(re.findall(r"""(?i)href=["']?([^\s"'<>]+)""", text))
                if urls:
                    logger.info('got %r distinct urls from %r', len(urls), response.url)
                for url in urls:
                    normalized = urllib.parse.urljoin(response.url, url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

        stats = FetchStatistic(url=response.url, next_url=None, status=response.status, exception=None, size=len(body),
                               content_type=content_type, encoding=encoding, num_urls=len(links),
                               num_new_urls=len(links - self.seen_urls))
        return stats, links
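FetchStatistic is not defined in this snippet; judging from the keyword arguments used here and in fetch(), it is presumably a plain record type along these lines (a reconstruction, not necessarily the original definition):

from collections import namedtuple

# Fields inferred from the FetchStatistic(...) calls in this example.
FetchStatistic = namedtuple('FetchStatistic',
                            ['url', 'next_url', 'status', 'exception',
                             'size', 'content_type', 'encoding',
                             'num_urls', 'num_new_urls'])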

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = yield from self.session.get(url, allow_redirects=False)
                if tries > 1:
                    logger.info('try %r for %r success', tries, url)
                break
            except aiohttp.ClientError as client_error:
                logger.info('try %r for %r raised %r', tries, url, client_error)
                exception = client_error

            tries += 1

        else:
            logger.error('%r failed after %r tries', url, self.max_tries)

            self.record_statistic(FetchStatistic(url=url, next_url=None, status=None, exception=exception, size=0,
                                                 content_type=None, encoding=None, num_urls=0, num_new_urls=0))
            return

        try:
            if is_redirect(response):
                location = response.headers['location']
                next_url = urllib.parse.urljoin(url, location)
                self.record_statistic(
                    FetchStatistic(url=url, next_url=next_url, status=response.status, exception=None, size=0,
                                   content_type=None, encoding=None, num_urls=0, num_new_urls=0))
                if next_url in self.seen_urls:
                    return
                if max_redirect > 0:
                    logger.info('redirect to %r from %r', next_url, url)
                    self.add_url(next_url, max_redirect - 1)
                else:
                    logger.error('redirect limit reached for %r from %r', next_url, url)
            else:
                stat, links = yield from self.parse_links(response)
                self.record_statistic(stat)
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            # Return the connection to the pool so it can be reused.
            yield from response.release()

    @asyncio.coroutine
    def work(self):
        try:
            while True:
                url, max_redirect = yield from self.q.get()
                assert url in self.seen_urls
                yield from self.fetch(url, max_redirect)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    def url_allowed(self, url):
        if self.exclude and re.search(self.exclude, url):
            return False
        parts = urllib.parse.urlparse(url)
        if parts.scheme not in ('http', 'https'):
            logger.debug('skipping non-http scheme in %r', url)
            return False
        host, port = urllib.parse.splitport(parts.netloc)
        if not self.host_okay(host):
            logger.debug('skipping non-root host in %r', url)
            return False
        return True

    def add_url(self, url, max_redirect=None):
        if max_redirect is None:
            max_redirect = self.max_redirect
        logger.debug('adding %r %r', url, max_redirect)
        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect))

    @asyncio.coroutine
    def crawl(self):
        workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)]
        self.t0 = time.time()
        yield from self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()
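Driving this crawler is straightforward (a sketch; the root URL is a placeholder and error handling is omitted):

import asyncio

def main():
    loop = asyncio.get_event_loop()
    crawler = Crawler(['http://example.com/'], max_tasks=10)
    try:
        # crawl() joins the queue and cancels the workers once it drains.
        loop.run_until_complete(crawler.crawl())
    finally:
        crawler.close()
        loop.close()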
Example #41
0
class Crawler:
    """Crawl a set of URLs.

    This manages two sets of URLs: 'urls' and 'done'.  'urls' is a set of
    URLs seen, and 'done' is a list of FetchStatistics.
    """

    def __init__(self, roots,
                 # What to crawl.
                 exclude=None, include=None, output=None, strict=True, count=None,
                 proxy=None, max_redirect=10, max_tries=4,  # Per-url limits.
                 max_tasks=10, loop=None, no_parse_links=False):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.include = include
        self.output = output
        self.count = int(count) if count else None
        self.strict = strict
        self.proxy = proxy
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.task_exit_counter = 0
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        self.no_parse_links = no_parse_links
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_url(root)
        self.t0 = time.time()
        self.t1 = None
        self.output_file = self.get_file()

    @asyncio.coroutine
    def close(self):
        """Close resources."""
        yield from self.session.close()

    def host_okay(self, host):
        """Check if a host should be crawled.

        A literal match (after lowercasing) is always good.  For hosts
        that don't look like IP addresses, some approximate matches
        are okay depending on the strict flag.
        """
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)
        else:
            return self._host_okay_lenient(host)

    def _host_okay_strictish(self, host):
        """Check if a host should be crawled, strict-ish version.

        This checks for equality modulo an initial 'www.' component.
        """
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        """Check if a host should be crawled, lenient version.

        This compares the last two components of the host.
        """
        return lenient_host(host) in self.root_domains
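lenient_host is an external helper that is not shown here; per the docstring above it reduces a hostname to its last two components, so a sketch of it could be:

def lenient_host(host):
    # 'www.news.example.com' -> 'examplecom' (assumed behaviour)
    parts = host.split('.')[-2:]
    return ''.join(parts)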

    def record_statistic(self, fetch_statistic):
        """Record the FetchStatistic for completed / failed URL."""
        self.done.append(fetch_statistic)

    def parse_text(self, url, text):
        '''
        Call the callback function registered for the matched route.
        '''
        route, args = router.match(url)
        if route:
            route.call(text, **args)

    @asyncio.coroutine
    def parse_links(self, response):
        """Return a FetchStatistic and list of links."""
        links = set()
        content_type = None
        encoding = None
        body = yield from response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = yield from response.text(errors='ignore')

                # Replace href with (?:href|src) to follow image links.
                urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''',
                                      text))
                if urls:
                    logger.debug('got %r distinct urls from %r',
                                 len(urls), response.url)
                for url in urls:
                    normalized = urllib.parse.urljoin(str(response.url), url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

                # parse text
                self.parse_text(str(response.url), text)

                # write output
                self.handle_output(str(response.url), text)

        stat = FetchStatistic(
            url=response.url,
            next_url=None,
            status=response.status,
            exception=None,
            size=len(body),
            content_type=content_type,
            encoding=encoding,
            num_urls=len(links),
            num_new_urls=len(links - self.seen_urls))

        return stat, links

    def handle_output(self, url, text):
        if self.output:
            d = self.parse_output(url, text)
            logger.info(f'write item: {url}')
            outputing.do_write(self.output, d, self.output_file)

    def parse_output(self, url, text):
        html = HTML(html=text)
        title_ele = html.find('title', first=True)
        d = OrderedDict()
        d['title'] = title_ele.text
        d['url'] = url
        d['datetime'] = now_time()
        d['text'] = text
        return d

    def get_file(self):
        '''
        generate a file name for output
        '''
        domains = list(self.root_domains)
        dt = datetime.datetime.now()
        dt_str = dt.strftime('%Y-%m-%d %H:%M:%S')
        f_name = f'{domains[0]}-{dt_str}'
        if self.output:
            if self.output == 'stream':
                return None
            f_name += f'.{self.output}'
        return f_name

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        """Fetch one URL."""
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = yield from self.session.get(
                    url, allow_redirects=False, proxy=self.proxy)

                if tries > 1:
                    logger.info('try %r for %r success', tries, url)

                break
            except aiohttp.ClientError as client_error:
                logger.info('try %r for %r raised %r',
                            tries, url, client_error)
                exception = client_error

            tries += 1
        else:
            # We never broke out of the loop: all tries failed.
            logger.error('%r failed after %r tries',
                         url, self.max_tries)
            self.record_statistic(FetchStatistic(url=url,
                                                 next_url=None,
                                                 status=None,
                                                 exception=exception,
                                                 size=0,
                                                 content_type=None,
                                                 encoding=None,
                                                 num_urls=0,
                                                 num_new_urls=0))
            return

        try:
            if is_redirect(response):
                location = response.headers['location']
                next_url = urllib.parse.urljoin(url, location)
                self.record_statistic(FetchStatistic(url=url,
                                                     next_url=next_url,
                                                     status=response.status,
                                                     exception=None,
                                                     size=0,
                                                     content_type=None,
                                                     encoding=None,
                                                     num_urls=0,
                                                     num_new_urls=0))

                if next_url in self.seen_urls:
                    return
                if max_redirect > 0:
                    logger.info('redirect to %r from %r', next_url, url)
                    self.add_url(next_url, max_redirect - 1)
                else:
                    logger.error('redirect limit reached for %r from %r',
                                 next_url, url)
            else:
                stat, links = yield from self.parse_links(response)
                self.record_statistic(stat)
                # skip link extraction entirely when no_parse_links is set
                if not self.no_parse_links:
                    for link in links.difference(self.seen_urls):
                        # use router to verify links
                        if self.verify_url(link) or router.verify_url(link, url):
                            self.q.put_nowait((link, self.max_redirect))
                    self.seen_urls.update(links)
        except Exception as ex:
            logger.error(f'parse error: {url}')
            logger.exception(ex)
        finally:
            yield from asyncio.sleep(1)
            yield from response.release()

    @asyncio.coroutine
    def exit_on_empty_queue(self):
        if self.count and len(self.done) >= self.count:
            logger.warning(f'reach count: {self.count}, now quit')
            router.stop()

        if self.q.qsize() == 0:
            logger.warning('empty queue, now quit')
            yield from self.q.join()
            router.stop()

    @asyncio.coroutine
    def work(self):
        """Process queue items forever."""
        try:
            while router.is_running():
                url, max_redirect = yield from self.q.get()
                logger.debug(f'work on url {url}')
                assert url in self.seen_urls
                yield from self.fetch(url, max_redirect)
                self.q.task_done()
                yield from self.exit_on_empty_queue()

        except asyncio.CancelledError:
            logger.warning('canceling the worker')

    def url_allowed(self, url):
        parts = urllib.parse.urlparse(url)
        if parts.scheme not in ('http', 'https'):
            # logger.debug('skipping non-http scheme in %r', url)
            return False
        host, port = urllib.parse.splitport(parts.netloc)
        if not self.host_okay(host):
            # logger.debug('skipping non-root host in %r', url)
            return False
        return True

    def verify_url(self, url):
        if self.include:
            for pattern in self.include:
                if re.search(pattern, url):
                    logger.debug(
                        f'{url} match include pattern: {pattern}, allowed')
                    return True
        if self.exclude and re.search(self.exclude, url):
            logger.debug(
                f'{url} match exclude pattern: {self.exclude}, rejected')
            return False
        # reject anything that matched neither include nor exclude
        return False
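The precedence in verify_url() is easy to misread: an include pattern whitelists a URL immediately, the exclude pattern rejects, and everything that matches neither is rejected as well. For example (hypothetical patterns):

# include=['/blog/'], exclude=r'\.(jpg|png)$'
# verify_url('https://example.com/blog/post-1')  -> True   (matches include)
# verify_url('https://example.com/banner.png')   -> False  (matches exclude)
# verify_url('https://example.com/about')        -> False  (default: rejected)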

    def add_url(self, url, max_redirect=None):
        """Add a URL to the queue if not seen before."""
        if max_redirect is None:
            max_redirect = self.max_redirect
        logger.debug('adding %r %r', url, max_redirect)
        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect))

    @asyncio.coroutine
    def crawl(self):
        """Run the crawler until all finished."""
        try:
            workers = [asyncio.Task(self.work(), loop=self.loop)
                       for _ in range(self.max_tasks)]
            self.t0 = time.time()
            # yield from asyncio.gather(*workers, loop=self.loop, return_exceptions=True)
            yield from router.quit_event.wait()
            for w in workers:
                w.cancel()
            self.t1 = time.time()
        except asyncio.CancelledError:
            logger.warning('canceling the crawler')
        finally:
            logger.warning('closing the crawler')
            yield from self.close()
Example #42
0
class Crawl:
    def __init__(self,
                 roots,
                 exclude=None,
                 strict=True,
                 base_url=None,
                 max_redirect=2,
                 max_tries=2,
                 max_tasks=1,
                 proxy=None,
                 *,
                 loop=None):
        self.base_url = base_url
        self.t0 = time.time()
        self.t1 = None
        self.strict = strict
        self.exclude = exclude
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.loop = loop or asyncio.get_event_loop()
        self.q = Queue(loop=self.loop)
        self.proxy = proxy
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        self.seen_urls = set()
        self.done = []
        for root in roots:
            parts = urlparse(root)
            host, port = splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_url(root)

    def add_url(self, url, max_redirect=None):
        if max_redirect is None:
            max_redirect = self.max_redirect
        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect))

    def record_statistic(self, fetch_statistic):
        """Record the FetchStatistic for completed / failed URL."""
        self.done.append(fetch_statistic)

    async def parse_links(self, response):
        links = set()
        content_type = None
        encoding = None
        price_link = []
        if response.status == 200:
            content_type = response.headers.get('content-type')
            response_url = str(response.url)
            if content_type in ('text/html', 'application/xml',
                                'text/html;charset=UTF-8'):
                pdict = {}
                if content_type:
                    content_type, pdict = cgi.parse_header(content_type)

                encoding = pdict.get('charset', 'utf-8')
                if content_type in ('text/html', 'application/xml'):
                    text = await response.text()
                    # print(text)
                    # generic link pattern for reference: (?i)href=["']([^\s"'<>]+)
                    urls = set(
                        re.findall(
                            '<li style="margin-left: [-\d]+px">.*?<a href="(/s/ref=lp_\d+_nr_n_[\d+].*?)">.*?<span class="refinementLink">(.*?)</span>.*?</a>.*?</li>',
                            text, re.S | re.M))

                    if urls:
                        LOGGER.info('got %r distinct urls from %r', len(urls),
                                    response.url)
                    else:
                        for price_g in range(1, 100, 2):
                            low_price = price_g
                            high_price = price_g + 1
                            price_link.append(
                                "{}&low-price={}&high-price={}".format(
                                    response_url, low_price, high_price))

                    if len(price_link) > 0:
                        redis_server.lpush("price_link_tmp", *price_link)

                    for url in urls:
                        u, t = url
                        k = u.replace('&amp;', '&')
                        normalized = urljoin(str(response.url), k)
                        defragmented, frag = urldefrag(normalized)
                        if self.url_allowed(defragmented):
                            print(defragmented, t)
                            ''' Children's Books (儿童图书) and General (科学通俗读物) both end up redirecting to each other:
                                INFO:__main__:redirect to 'https://www.amazon.cn/s/ref=lp_2084813051_nr_n_11/460-8646033-3118437?rh=n%3A2084813051&ie=UTF8' from 
                                'https://www.amazon.cn/s/ref=lp_2084813051_nr_n_11/460-8646033-3118437?fst=as%3Aoff&rh=n%3A658390051%2Cn%3A%21658391051%2Cn%3A2045366051%2Cn%3A2078652051%2Cn%3A2084813051%2Cn%3A2084839051&bbn=2084813051&ie=UTF8&qid=1511710241&rnid=2084813051'
                            '''
                            LOGGER.debug(
                                'previous url: %s, next url: %s, title: %s',
                                str(response.url), defragmented, t)
                            if t == "General (科学通俗读物)":
                                LOGGER.error("错误的分类: %r", t)
                            else:
                                links.add(defragmented)

        stat = FetchStatistic(url=response.url)
        return stat, links

    async def fetch(self, url, max_redirect):
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                headers = {
                    'User-Agent': FakeChromeUA.get_ua(),
                    'Accept-Encoding': 'gzip, deflate, sdch',
                    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
                    'Accept':
                    'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Connection': 'keep-alive'
                }
                response = await self.session.get(url,
                                                  allow_redirects=False,
                                                  headers=headers
                                                  )  #proxy=self.p,
                if tries > 1:
                    LOGGER.info('try %r for %r success', tries, url)
                break
            except aiohttp.ClientError as client_error:
                LOGGER.info('try %r for %r raised %r', tries, url,
                            client_error)
                exception = client_error
            tries += 1
        else:
            LOGGER.error('%r failed after %r tries', url, self.max_tries)
            return

        try:
            if is_redirect(response):
                location = response.headers['location']
                next_url = urljoin(url, location)
                if next_url in self.seen_urls:
                    return
                if max_redirect > 0:
                    LOGGER.info('redirect to %r from %r', next_url, url)
                    self.add_url(next_url, max_redirect - 1)
                else:
                    LOGGER.error('redirect limit reached for %r from %r',
                                 next_url, url)
            else:
                # LOGGER.info('ubuntu ing')
                stat, links = await self.parse_links(response)
                self.record_statistic(stat)
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            await response.release()

    async def work(self):
        try:
            while True:
                url, max_redirect = await self.q.get()
                assert url in self.seen_urls
                await self.fetch(url, max_redirect)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    def host_okay(self, host):
        """Check if a host should be crawled.

        A literal match (after lowercasing) is always good.  For hosts
        that don't look like IP addresses, some approximate matches
        are okay depending on the strict flag.
        """
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)
        else:
            return self._host_okay_lenient(host)

    def _host_okay_strictish(self, host):
        """Check if a host should be crawled, strict-ish version.

        This checks for equality modulo an initial 'www.' component.
        """
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        """Check if a host should be crawled, lenient version.

        This compares the last two components of the host.
        """
        return lenient_host(host) in self.root_domains

    def url_allowed(self, url):
        if self.exclude and re.search(self.exclude, url):
            return False
        parts = urlparse(url)
        if parts.scheme not in ('http', 'https'):
            LOGGER.debug('skipping non-http scheme in %r', url)
            return False
        host, port = splitport(parts.netloc)
        if not self.host_okay(host):
            LOGGER.debug('skipping non-root host in %r', url)
            return False
        return True

    async def crawl(self):
        """Run the crawler until all finished."""
        workers = [
            asyncio.Task(self.work(), loop=self.loop)
            for _ in range(self.max_tasks)
        ]
        self.t0 = time.time()
        await self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()

    def close(self):
        self.session.close()

    def check_result(self):
        print(self.root_domains)

    def __call__(self):
        print("__call__ function")
        print(self.root_domains)
Example #43
0
class KnxMap:
    """The main scanner instance that takes care of scheduling workers for the targets."""

    def __init__(self, targets=None, max_workers=100, loop=None):
        self.loop = loop or asyncio.get_event_loop()
        # The number of concurrent workers for discovering KNXnet/IP gateways
        self.max_workers = max_workers
        # q contains all KNXnet/IP gateways
        self.q = Queue(loop=self.loop)
        # bus_queues is a dict containing a bus queue for each KNXnet/IP gateway
        self.bus_queues = dict()
        # bus_protocols is a list of all bus protocol instances for proper connection shutdown
        self.bus_protocols = list()
        # knx_gateways is a list of KnxTargetReport objects, one for each found KNXnet/IP gateway
        self.knx_gateways = list()
        # bus_devices is a list of KnxBusTargetReport objects, one for each found bus device
        self.bus_devices = set()
        self.bus_info = False
        self.t0 = time.time()
        self.t1 = None
        if targets:
            self.set_targets(targets)
        else:
            self.targets = set()

    def set_targets(self, targets):
        self.targets = targets
        for target in self.targets:
            self.add_target(target)

    def add_target(self, target):
        self.q.put_nowait(target)

    def add_bus_queue(self, gateway, bus_targets):
        self.bus_queues[gateway] = Queue(loop=self.loop)
        for target in bus_targets:
            self.bus_queues[gateway].put_nowait(target)
        return self.bus_queues[gateway]

    @asyncio.coroutine
    def bruteforce_auth_key(self, knx_gateway, target, full_key_space=False):
        if isinstance(target, set):
            target = list(target)[0]
        future = asyncio.Future()
        transport, protocol = yield from self.loop.create_datagram_endpoint(
            functools.partial(KnxTunnelConnection, future),
            remote_addr=(knx_gateway[0], knx_gateway[1]))
        self.bus_protocols.append(protocol)
        # Make sure the tunnel has been established
        connected = yield from future
        alive = yield from protocol.tpci_connect(target)
        if full_key_space:
            key_space = range(0, 0xffffffff)
        else:
            key_space = [0x11223344, 0x12345678, 0x00000000, 0x87654321, 0x11111111, 0xffffffff]
        # Bruteforce the key via A_Authorize_Request messages
        for key in key_space:
            access_level = yield from protocol.apci_authenticate(target, key)
            if access_level == 0:
                LOGGER.info("GOT THE KEY: {}".format(format(key, '08x')))
                break

    @asyncio.coroutine
    def knx_bus_worker(self, transport, protocol, queue):
        """A worker for communicating with devices on the bus."""
        try:
            while True:
                target = queue.get_nowait()
                LOGGER.info('BUS: target: {}'.format(target))
                if not protocol.tunnel_established:
                    LOGGER.error('KNX tunnel is not open!')
                    return

                alive = yield from protocol.tpci_connect(target)

                if alive:
                    properties = collections.OrderedDict()
                    serial = None

                    # DeviceDescriptorRead
                    descriptor = yield from protocol.apci_device_descriptor_read(target)
                    if not descriptor:
                        tunnel_request = protocol.make_tunnel_request(target)
                        tunnel_request.tpci_unnumbered_control_data('DISCONNECT')
                        protocol.send_data(tunnel_request.get_message(), target)
                        queue.task_done()
                        continue

                    if not self.bus_info:
                        t = KnxBusTargetReport(address=target)
                        self.bus_devices.add(t)
                        tunnel_request = protocol.make_tunnel_request(target)
                        tunnel_request.tpci_unnumbered_control_data('DISCONNECT')
                        protocol.send_data(tunnel_request.get_message(), target)
                        queue.task_done()
                        continue

                    dev_desc = struct.unpack('!H', descriptor)[0]
                    desc_medium, desc_type, desc_version = KnxMessage.parse_device_descriptor(dev_desc)

                    if desc_type > 1:
                        # Read System 2 and System 7 manufacturer ID object
                        manufacturer = yield from protocol.apci_property_value_read(
                            target,
                            property_id=DEVICE_OBJECTS.get('PID_MANUFACTURER_ID'))
                        if isinstance(manufacturer, (str, bytes)):
                            manufacturer = int.from_bytes(manufacturer, 'big')
                            manufacturer = get_manufacturer_by_id(manufacturer)

                        # Read the device state
                        device_state = yield from protocol.apci_memory_read(
                            target,
                            memory_address=0x0060)
                        if device_state:
                            properties['DEVICE_STATE'] = KnxMessage.unpack_cemi_runstate(
                                int.from_bytes(device_state, 'big'))

                        # Read the serial number object on System 2 and System 7 devices
                        serial = yield from protocol.apci_property_value_read(
                            target,
                            property_id=DEVICE_OBJECTS.get('PID_SERIAL_NUMBER'))
                        if isinstance(serial, (str, bytes)):
                            serial = codecs.encode(serial, 'hex').decode().upper()

                        for object_index, props in OBJECTS.items():
                            x = collections.OrderedDict()
                            for k, v in props.items():
                                ret = yield from protocol.apci_property_value_read(
                                    target,
                                    property_id=v,
                                    object_index=object_index)
                                if ret:
                                    x[k.replace('PID_', '')] = codecs.encode(ret, 'hex')
                            if x:
                                properties[OBJECT_TYPES.get(object_index)] = x

                    else:
                        # Try to MemoryRead the manufacturer ID on System 1 devices.
                        # Note: System 1 devices do not support access controls, so
                        # an authorization request is not needed.
                        manufacturer = yield from protocol.apci_memory_read(
                            target,
                            memory_address=0x0104,
                            read_count=1)
                        if isinstance(manufacturer, (str, bytes)):
                            manufacturer = int.from_bytes(manufacturer, 'big')
                            manufacturer = get_manufacturer_by_id(manufacturer)

                        device_state = yield from protocol.apci_memory_read(
                            target,
                            memory_address=0x0060)
                        if device_state:
                            properties['DEVICE_STATE'] = codecs.encode(device_state, 'hex')

                        ret = yield from protocol.apci_memory_read(
                            target,
                            memory_address=0x0105,
                            read_count=2)
                        if ret:
                            properties['DevTyp'] = codecs.encode(ret, 'hex')

                        ret = yield from protocol.apci_memory_read(
                            target,
                            memory_address=0x0101,
                            read_count=3)
                        if ret:
                            properties['ManData'] = codecs.encode(ret, 'hex')

                        ret = yield from protocol.apci_memory_read(
                            target,
                            memory_address=0x0108,
                            read_count=1)
                        if ret:
                            properties['CheckLim'] = codecs.encode(ret, 'hex')

                        ret = yield from protocol.apci_memory_read(
                            target,
                            memory_address=0x01FE,
                            read_count=1)
                        if ret:
                            properties['UsrPrg'] = codecs.encode(ret, 'hex')

                        ret = yield from protocol.apci_memory_read(
                            target,
                            memory_address=0x0116,
                            read_count=4)
                        if ret:
                            properties['AdrTab'] = codecs.encode(ret, 'hex')

                        start_addr = 0x0100
                        properties['EEPROM_DUMP'] = b''
                        for i in range(51):
                            ret = yield from protocol.apci_memory_read(
                                target,
                                memory_address=start_addr,
                                read_count=5)
                            if ret:
                                properties['EEPROM_DUMP'] += codecs.encode(ret, 'hex')
                            start_addr += 5

                    if descriptor:
                        t = KnxBusTargetReport(
                            address=target,
                            medium=desc_medium,
                            type=desc_type,
                            version=desc_version,
                            device_serial=serial,
                            manufacturer=manufacturer,
                            properties=properties)
                        self.bus_devices.add(t)

                    # Properly close the TPCI layer
                    yield from protocol.tpci_disconnect(target)

                queue.task_done()
        except asyncio.CancelledError:
            pass
        except asyncio.QueueEmpty:
            pass

    @asyncio.coroutine
    def bus_scan(self, knx_gateway, bus_targets):
        queue = self.add_bus_queue(knx_gateway.host, bus_targets)
        LOGGER.info('Scanning {} bus device(s) on {}'.format(queue.qsize(), knx_gateway.host))
        future = asyncio.Future()
        transport, bus_protocol = yield from self.loop.create_datagram_endpoint(
            functools.partial(
                KnxTunnelConnection,
                future,
                ndp_defer_time=self.bus_timeout,
                knx_source=self.knx_source),
            remote_addr=(knx_gateway.host, knx_gateway.port))
        self.bus_protocols.append(bus_protocol)

        # Make sure the tunnel has been established
        connected = yield from future
        if connected:
            workers = [asyncio.Task(self.knx_bus_worker(transport, bus_protocol, queue), loop=self.loop)]
            self.t0 = time.time()
            yield from queue.join()
            self.t1 = time.time()
            for w in workers:
                w.cancel()
            bus_protocol.knx_tunnel_disconnect()

        for i in self.bus_devices:
            knx_gateway.bus_devices.append(i)

        LOGGER.info('Bus scan took {} seconds'.format(self.t1 - self.t0))

    @asyncio.coroutine
    def knx_search_worker(self):
        """Send a KnxSearch request to see if target is a KNX device."""
        try:
            sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
            sock.setblocking(0)
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_BINDTODEVICE, struct.pack('256s', str.encode(self.iface)))

            protocol = KnxGatewaySearch()
            waiter = asyncio.Future(loop=self.loop)
            transport = self.loop._make_datagram_transport(
                sock, protocol, ('224.0.23.12', 3671), waiter)

            try:
                # Wait until connection_made() has been called on the transport
                yield from waiter
            except Exception:
                LOGGER.error('Creating multicast transport failed!')
                transport.close()
                return

            # Wait SEARCH_TIMEOUT seconds for responses to our multicast packets
            yield from asyncio.sleep(self.search_timeout)

            if protocol.responses:
                # If protocol received SEARCH_RESPONSE packets, print them
                for response in protocol.responses:
                    peer = response[0]
                    response = response[1]
                    t = KnxTargetReport(
                        host=peer[0],
                        port=peer[1],
                        mac_address=response.body.get('dib_dev_info').get('knx_mac_address'),
                        knx_address=response.body.get('dib_dev_info').get('knx_address'),
                        device_serial=response.body.get('dib_dev_info').get('knx_device_serial'),
                        friendly_name=response.body.get('dib_dev_info').get('device_friendly_name'),
                        device_status=response.body.get('dib_dev_info').get('device_status'),
                        knx_medium=response.body.get('dib_dev_info').get('knx_medium'),
                        project_install_identifier=response.body.get('dib_dev_info').get('project_install_identifier'),
                        supported_services=[
                            KNX_SERVICES[k] for k, v in
                            response.body.get('dib_supp_sv_families').get('families').items()],
                        bus_devices=[])

                    self.knx_gateways.append(t)
        except asyncio.CancelledError:
            pass

    @asyncio.coroutine
    def search_gateways(self):
        self.t0 = time.time()
        yield from asyncio.ensure_future(asyncio.Task(self.knx_search_worker(), loop=self.loop))
        self.t1 = time.time()
        LOGGER.info('Scan took {} seconds'.format(self.t1 - self.t0))

    @asyncio.coroutine
    def knx_description_worker(self):
        """Send a KnxDescription request to see if target is a KNX device."""
        try:
            while True:
                target = self.q.get_nowait()
                LOGGER.debug('Scanning {}'.format(target))
                for _try in range(self.desc_retries):
                    LOGGER.debug('Sending {}. KnxDescriptionRequest to {}'.format(_try, target))
                    future = asyncio.Future()
                    yield from self.loop.create_datagram_endpoint(
                        functools.partial(KnxGatewayDescription, future, timeout=self.desc_timeout),
                        remote_addr=target)
                    response = yield from future
                    if response:
                        break

                if response and isinstance(response, KnxDescriptionResponse):
                    t = KnxTargetReport(
                        host=target[0],
                        port=target[1],
                        mac_address=response.body.get('dib_dev_info').get('knx_mac_address'),
                        knx_address=response.body.get('dib_dev_info').get('knx_address'),
                        device_serial=response.body.get('dib_dev_info').get('knx_device_serial'),
                        friendly_name=response.body.get('dib_dev_info').get('device_friendly_name'),
                        device_status=response.body.get('dib_dev_info').get('device_status'),
                        knx_medium=response.body.get('dib_dev_info').get('knx_medium'),
                        project_install_identifier=response.body.get('dib_dev_info').get('project_install_identifier'),
                        supported_services=[
                            KNX_SERVICES[k] for k, v in
                            response.body.get('dib_supp_sv_families').get('families').items()],
                        bus_devices=[])

                    self.knx_gateways.append(t)
                self.q.task_done()
        except (asyncio.CancelledError, asyncio.QueueEmpty):
            pass

    @asyncio.coroutine
    def monitor(self, targets=None, group_monitor_mode=False):
        if targets:
            self.set_targets(targets)
        if group_monitor_mode:
            LOGGER.debug('Starting group monitor')
        else:
            LOGGER.debug('Starting bus monitor')
        future = asyncio.Future()
        transport, protocol = yield from self.loop.create_datagram_endpoint(
            functools.partial(KnxBusMonitor, future, group_monitor=group_monitor_mode),
            remote_addr=list(self.targets)[0])
        self.bus_protocols.append(protocol)
        yield from future
        if group_monitor_mode:
            LOGGER.debug('Starting group monitor')
        else:
            LOGGER.debug('Starting bus monitor')

    @asyncio.coroutine
    def search(self, search_timeout=5, iface=None):
        self.iface = iface
        self.search_timeout = search_timeout
        LOGGER.info('Make sure there are no filtering rules that drop UDP multicast packets!')
        yield from self.search_gateways()
        for t in self.knx_gateways:
            print_knx_target(t)
        LOGGER.info('Searching done')

    @asyncio.coroutine
    def brute(self, targets=None, bus_target=None, full_key_space=False):
        if targets:
            self.set_targets(targets)
        tasks = [asyncio.Task(self.bruteforce_auth_key(t, bus_target, full_key_space),
                              loop=self.loop) for t in self.targets]
        yield from asyncio.wait(tasks)

    @asyncio.coroutine
    def scan(self, targets=None, desc_timeout=2, desc_retries=2, bus_timeout=2,
             bus_targets=None, bus_info=False, knx_source=None, auth_key=0xffffffff):
        """The function that will be called by run_until_complete(). This is the main coroutine."""
        self.auth_key = auth_key
        if targets:
            self.set_targets(targets)

        self.desc_timeout = desc_timeout
        self.desc_retries = desc_retries
        self.bus_timeout = bus_timeout
        self.knx_source = knx_source
        workers = [asyncio.Task(self.knx_description_worker(), loop=self.loop)
                   for _ in range(min(len(self.targets), self.max_workers))]
        self.t0 = time.time()
        yield from self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()

        if bus_targets and self.knx_gateways:
            self.bus_info = bus_info
            bus_scanners = [asyncio.Task(self.bus_scan(g, bus_targets), loop=self.loop) for g in self.knx_gateways]
            yield from asyncio.wait(bus_scanners)
        else:
            LOGGER.info('Scan took {} seconds'.format(self.t1 - self.t0))

        for t in self.knx_gateways:
            print_knx_target(t)
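As its docstring says, scan() is the main coroutine; a minimal way to invoke it might be the following sketch (the target address is a placeholder, and cleanup of the bus protocols is omitted):

import asyncio

def main():
    loop = asyncio.get_event_loop()
    scanner = KnxMap(targets={('192.0.2.10', 3671)}, loop=loop)
    try:
        loop.run_until_complete(scanner.scan())
    finally:
        loop.close()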

    @asyncio.coroutine
    def group_writer(self, target, value=0, routing=False, desc_timeout=2,
                     desc_retries=2, iface=False):
        self.desc_timeout = desc_timeout
        self.desc_retries = desc_retries
        self.iface = iface
        workers = [asyncio.Task(self.knx_description_worker(), loop=self.loop)
                   for _ in range(min(len(self.targets), self.max_workers))]
        self.t0 = time.time()
        yield from self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()

        if self.knx_gateways:
            # TODO: make sure only a single gateway is supplied
            knx_gateway = self.knx_gateways[0]
        else:
            LOGGER.error('No valid KNX gateway found')
            return

        if routing:
            # Use KNX Routing to write group values
            if 'KNXnet/IP Routing' not in knx_gateway.supported_services:
                LOGGER.error('KNX gateway {gateway} does not support Routing'.format(
                    gateway=knx_gateway.host))

            sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
            sock.setblocking(0)
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_BINDTODEVICE, struct.pack('256s', str.encode(self.iface)))

            # TODO: what if we have devices that access more advanced payloads?
            if isinstance(value, str):
                value = int(value)
            protocol = KnxRoutingConnection(target=target, value=value)
            waiter = asyncio.Future(loop=self.loop)
            transport = self.loop._make_datagram_transport(
                sock, protocol, ('224.0.23.12', 3671), waiter)

            try:
                # Wait until connection_made() has been called on the transport
                yield from waiter
            except Exception:
                LOGGER.error('Creating multicast transport failed!')
                transport.close()
                return

        else:
            # Use KNX Tunnelling to write group values
            if 'KNXnet/IP Tunnelling' not in knx_gateway.supported_services:
                LOGGER.error('KNX gateway {gateway} does not support Tunnelling'.format(
                    gateway=knx_gateway.host))

            future = asyncio.Future()
            transport, protocol = yield from self.loop.create_datagram_endpoint(
                functools.partial(KnxTunnelConnection, future),
                remote_addr=(knx_gateway.host, knx_gateway.port))
            self.bus_protocols.append(protocol)

            # Make sure the tunnel has been established
            connected = yield from future

            if connected:
                # TODO: what if we have devices that access more advanced payloads?
                if isinstance(value, str):
                    value = int(value)
                yield from protocol.apci_group_value_write(target, value=value)
                protocol.knx_tunnel_disconnect()


    @asyncio.coroutine
    def apci(self, target, desc_timeout=2, desc_retries=2, iface=False, args=None):
        self.desc_timeout = desc_timeout
        self.desc_retries = desc_retries
        self.iface = iface
        self.knx_source = args.knx_source
        workers = [asyncio.Task(self.knx_description_worker(), loop=self.loop)
                   for _ in range(min(len(self.targets), self.max_workers))]
        self.t0 = time.time()
        yield from self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()

        if self.knx_gateways:
            # TODO: make sure only a single gateway is supplied
            knx_gateway = self.knx_gateways[0]
        else:
            LOGGER.error('No valid KNX gateway found')
            return

        # Use KNX Tunnelling to write group values
        if 'KNXnet/IP Tunnelling' not in knx_gateway.supported_services:
            LOGGER.error('KNX gateway {gateway} does not support Tunnelling'.format(
                gateway=knx_gateway.host))

        future = asyncio.Future()
        transport, protocol = yield from self.loop.create_datagram_endpoint(
            functools.partial(KnxTunnelConnection, future, knx_source=self.knx_source),
            remote_addr=(knx_gateway.host, knx_gateway.port))
        self.bus_protocols.append(protocol)

        # Make sure the tunnel has been established
        connected = yield from future

        if connected:
            if args.apci_type == 'Memory_Read':
                alive = yield from protocol.tpci_connect(target)
                if alive:
                    dev_type = yield from protocol.get_device_type(target)
                    if not dev_type:
                        protocol.knx_tunnel_disconnect()
                        protocol.tpci_disconnect(target)
                        return
                    if dev_type > 1 and not args.ignore_auth:
                        auth_key = args.auth_key
                        if not isinstance(auth_key, int):
                            try:
                                auth_key = int(auth_key, 16)
                            except ValueError:
                                LOGGER.error('Invalid authentication key')
                                protocol.knx_tunnel_disconnect()
                                protocol.tpci_disconnect(target)
                                return
                        auth_level = yield from protocol.apci_authenticate(
                            target,
                            key=auth_key)
                        if auth_level > 0:
                            LOGGER.error('Invalid authentication key')
                            protocol.knx_tunnel_disconnect()
                            protocol.tpci_disconnect(target)
                            return
                    memory_address = args.memory_address
                    if not isinstance(memory_address, int):
                        try:
                            memory_address = int(memory_address, 16)
                        except ValueError:
                            LOGGER.error('Invalid memory address')
                            protocol.knx_tunnel_disconnect()
                            protocol.tpci_disconnect(target)
                            return
                    data = yield from protocol.apci_memory_read(
                        target,
                        memory_address=memory_address,
                        read_count=args.read_count)
                    protocol.tpci_disconnect(target)
                    if not data:
                        LOGGER.debug('No data received')
                    else:
                        LOGGER.info(codecs.encode(data, 'hex'))
            elif args.apci_type == 'Memory_Write':
                alive = yield from protocol.tpci_connect(target)
                if alive:
                    dev_type = yield from protocol.get_device_type(target)
                    if not dev_type:
                        protocol.knx_tunnel_disconnect()
                        protocol.tpci_disconnect(target)
                        return
                    if dev_type > 1:
                        auth_key = args.auth_key
                        if not isinstance(auth_key, int):
                            try:
                                auth_key = int(auth_key, 16)
                            except ValueError:
                                LOGGER.error('Invalid authentication key')
                                protocol.knx_tunnel_disconnect()
                                protocol.tpci_disconnect(target)
                                return
                        auth_level = yield from protocol.apci_authenticate(
                            target,
                            key=auth_key)
                        if auth_level > 0:
                            LOGGER.error('Invalid authentication key')
                            protocol.knx_tunnel_disconnect()
                            protocol.tpci_disconnect(target)
                            return
                    memory_address = args.memory_address
                    memory_data = args.memory_data
                    if not isinstance(memory_address, int) or \
                            not isinstance(memory_data, bytes):
                        try:
                            memory_address = int(memory_address, 16)
                            memory_data = codecs.decode(memory_data, 'hex')
                        except ValueError:
                            LOGGER.error('Invalid memory address or write data')
                            protocol.knx_tunnel_disconnect()
                            protocol.tpci_disconnect(target)
                            return
                    data = yield from protocol.apci_memory_write(
                        target,
                        memory_address=memory_address,
                        write_count=args.read_count,
                        data=memory_data)
                    protocol.tpci_disconnect(target)
                    if not data:
                        LOGGER.debug('No data received')
                    else:
                        LOGGER.info(codecs.encode(data, 'hex'))
            elif args.apci_type == 'Key_Write':
                alive = yield from protocol.tpci_connect(target)
                if alive:
                    dev_type = yield from protocol.get_device_type(target)
                    if not dev_type:
                        protocol.knx_tunnel_disconnect()
                        protocol.tpci_disconnect(target)
                        return
                    if dev_type > 1:
                        auth_key = args.auth_key
                        if not isinstance(auth_key, int):
                            try:
                                auth_key = int(auth_key, 16)
                            except ValueError:
                                LOGGER.error('Invalid authentication key')
                                protocol.knx_tunnel_disconnect()
                                protocol.tpci_disconnect(target)
                                return
                        auth_level = yield from protocol.apci_authenticate(
                            target,
                            key=auth_key)
                        if auth_level > 0:
                            LOGGER.error('Invalid authentication key')
                            protocol.knx_tunnel_disconnect()
                            protocol.tpci_disconnect(target)
                            return
                    new_auth_key = args.new_auth_key
                    if not isinstance(new_auth_key, int):
                        try:
                            new_auth_key = int(new_auth_key, 16)
                        except ValueError:
                            LOGGER.error('Invalid new authentication key')
                            protocol.knx_tunnel_disconnect()
                            protocol.tpci_disconnect(target)
                            return
                    data = yield from protocol.apci_key_write(
                        target,
                        level=args.auth_level,
                        key=new_auth_key)
                    protocol.tpci_disconnect(target)
                    if not data:
                        LOGGER.debug('No data received')
                    else:
                        LOGGER.info('Authorization level: {}'.format(data))
            elif args.apci_type == 'PropertyValue_Read':
                property_id = args.property_id
                if not isinstance(property_id, int):
                    try:
                        property_id = int(property_id, 16)
                    except ValueError:
                        LOGGER.error('Invalid property ID')
                        protocol.knx_tunnel_disconnect()
                        protocol.tpci_disconnect(target)
                        return
                alive = yield from protocol.tpci_connect(target)
                if alive:
                    data = yield from protocol.apci_property_value_read(
                        target,
                        object_index=args.object_index,
                        property_id=property_id,
                        num_elements=args.num_elements,
                        start_index=args.start_index)
                    protocol.tpci_disconnect(target)
                    if not data:
                        LOGGER.debug('No data received')
                    else:
                        LOGGER.info(codecs.encode(data, 'hex'))
            elif args.apci_type == 'DeviceDescriptor_Read':
                alive = yield from protocol.tpci_connect(target)
                if alive:
                    data = yield from protocol.apci_device_descriptor_read(target)
                    protocol.tpci_disconnect(target)
                    if not data:
                        LOGGER.debug('No data received')
                    else:
                        LOGGER.info(codecs.encode(data, 'hex'))
            elif args.apci_type == 'Authorize':
                auth_key = args.auth_key
                if not isinstance(auth_key, int):
                    try:
                        auth_key = int(auth_key, 16)
                    except ValueError:
                        LOGGER.error('Invalid authentication key')
                        protocol.knx_tunnel_disconnect()
                        protocol.tpci_disconnect(target)
                        return
                alive = yield from protocol.tpci_connect(target)
                if alive:
                    data = yield from protocol.apci_authenticate(
                        target,
                        key=auth_key)
                    protocol.tpci_disconnect(target)
                    if isinstance(data, (type(None), type(False))):
                        LOGGER.debug('No data received')
                    else:
                        LOGGER.info('Authorization level: {}'.format(data))
            elif args.apci_type == 'IndividualAddress_Read':
                alive = yield from protocol.tpci_connect(target)
                if alive:
                    data = yield from protocol.apci_individual_address_read(target)
                    protocol.tpci_disconnect(target)
                    if isinstance(data, (type(None), type(False))):
                        LOGGER.debug('No data received')
                    else:
                        LOGGER.info('Individual address: {}'.format(data))
            elif args.apci_type == 'UserManufacturerInfo_Read':
                alive = yield from protocol.tpci_connect(target)
                if alive:
                    data = yield from protocol.apci_user_manufacturer_info_read(target)
                    protocol.tpci_disconnect(target)
                    if isinstance(data, (type(None), type(False))):
                        LOGGER.debug('No data received')
                    else:
                        LOGGER.info(codecs.encode(data, 'hex'))
            elif args.apci_type == 'Restart':
                alive = yield from protocol.tpci_connect(target)
                if alive:
                    yield from protocol.apci_restart(target)
                    protocol.tpci_disconnect(target)
            elif args.apci_type == 'Progmode':
                alive = yield from protocol.tpci_connect(target)
                if alive:
                    dev_type = yield from protocol.get_device_type(target)
                    if not dev_type:
                        protocol.knx_tunnel_disconnect()
                        protocol.tpci_disconnect(target)
                        return
                    if dev_type > 1:
                        auth_key = args.auth_key
                        if not isinstance(auth_key, int):
                            try:
                                auth_key = int(auth_key, 16)
                            except ValueError:
                                LOGGER.error('Invalid authentication key')
                                protocol.knx_tunnel_disconnect()
                                protocol.tpci_disconnect(target)
                                return
                        auth_level = yield from protocol.apci_authenticate(
                            target,
                            key=auth_key)
                        if auth_level > 0:
                            LOGGER.error('Invalid authentication key')
                            protocol.knx_tunnel_disconnect()
                            protocol.tpci_disconnect(target)
                            return
                    data = yield from protocol.apci_memory_read(
                        target,
                        memory_address=0x0060,
                        read_count=args.read_count)
                    if not data:
                        LOGGER.debug('No data received')
                    else:
                        data = int.from_bytes(data, 'big')
                        run_state = KnxMessage.unpack_cemi_runstate(data)
                        if args.toggle:
                            run_state = KnxMessage.pack_cemi_runstate(
                                prog_mode=not run_state.get('PROG_MODE'),
                                link_layer_active=run_state.get('LINK_LAYER'),
                                transport_layer_active=run_state.get('TRANSPORT_LAYER'),
                                app_layer_active=run_state.get('APP_LAYER'),
                                serial_interface_active=run_state.get('SERIAL_INTERFACE'),
                                user_app_run=run_state.get('USER_APP'),
                                bcu_download_mode=run_state.get('BC_DM'))
                            data = yield from protocol.apci_memory_write(
                                target,
                                memory_address=0x0060,
                                data=struct.pack('!B', run_state))
                            if not data:
                                LOGGER.debug('No data received')
                            else:
                                LOGGER.info(codecs.encode(data, 'hex'))
                        else:
                            if run_state.get('PROG_MODE'):
                                LOGGER.info('Programming mode ENABLED')
                            else:
                                LOGGER.info('Programming mode DISABLED')
                    protocol.tpci_disconnect(target)
            elif args.apci_type == 'GroupValue_Write':
                if not hasattr(args, 'value') or args.value is None:
                    LOGGER.error('Invalid parameters')
                    protocol.knx_tunnel_disconnect()
                    return
                if isinstance(args.value, str):
                    value = int(args.value)
                else:
                    value = args.value
                yield from protocol.apci_group_value_write(target, value=value)

            protocol.knx_tunnel_disconnect()
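Every hex-style argument above (memory address, authentication key, property ID) is converted with the same try/int(value, 16) pattern. A small helper in that spirit — hypothetical, not part of the tool — keeps the conversion and the error message in one place:

def parse_hex_arg(value, name='argument'):
    """Return value as an int, accepting an int or a hex string.

    Hypothetical helper; the example above inlines this logic for
    memory addresses, authentication keys and property IDs.
    """
    if isinstance(value, int):
        return value
    try:
        return int(value, 16)
    except (TypeError, ValueError):
        raise ValueError('Invalid {}: {!r}'.format(name, value))

# Usage sketch:
# memory_address = parse_hex_arg(args.memory_address, 'memory address')
# auth_key = parse_hex_arg(args.auth_key, 'authentication key')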
Beispiel #44
0
class Crawler:
    def __init__(self, roots,
                 exclude=None, strict=True,  # What to crawl.
                 max_redirect=10, max_tries=4,  # Per-url limits.
                 max_tasks=10, *, loop=None):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_url(root)
        self.t0 = time.time()
        self.t1 = None

    def close(self):
        self.session.close()

    def host_okay(self, host):
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)
        else:
            return self._host_okay_lenient(host)

    def _host_okay_strictish(self, host):
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        self.done.append(fetch_statistic)

    @asyncio.coroutine
    def parse_links(self, response):
        links = set()
        content_type = None
        encoding = None
        body = yield from response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = yield from response.text()

                # Replace href with (?:href|src) to follow image links.
                urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''',
                                      text))
                if urls:
                    LOGGER.info('got %r distinct urls from %r',
                                len(urls), response.url)
                for url in urls:
                    normalized = urllib.parse.urljoin(response.url, url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

        stat = FetchStatistic(
            url=response.url,
            next_url=None,
            status=response.status,
            exception=None,
            size=len(body),
            content_type=content_type,
            encoding=encoding,
            num_urls=len(links),
            num_new_urls=len(links - self.seen_urls))

        return stat, links

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = yield from self.session.get(
                    url, allow_redirects=False)

                if tries > 1:
                    LOGGER.info('try %r for %r success', tries, url)

                break
            except aiohttp.ClientError as client_error:
                LOGGER.info('try %r for %r raised %r', tries, url,
                            client_error)
                exception = client_error

            tries += 1
        else:
            # We never broke out of the loop: all tries failed.
            LOGGER.error('%r failed after %r tries',
                         url, self.max_tries)
            self.record_statistic(FetchStatistic(url=url,
                                                 next_url=None,
                                                 status=None,
                                                 exception=exception,
                                                 size=0,
                                                 content_type=None,
                                                 encoding=None,
                                                 num_urls=0,
                                                 num_new_urls=0))
            return

        try:
            if is_redirect(response):
                location = response.headers['location']
                next_url = urllib.parse.urljoin(url, location)
                self.record_statistic(FetchStatistic(url=url,
                                                     next_url=next_url,
                                                     status=response.status,
                                                     exception=None,
                                                     size=0,
                                                     content_type=None,
                                                     encoding=None,
                                                     num_urls=0,
                                                     num_new_urls=0))

                if next_url in self.seen_urls:
                    return
                if max_redirect > 0:
                    LOGGER.info('redirect to %r from %r', next_url, url)
                    self.add_url(next_url, max_redirect - 1)
                else:
                    LOGGER.error('redirect limit reached for %r from %r',
                                 next_url, url)
            else:
                stat, links = yield from self.parse_links(response)
                self.record_statistic(stat)
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            yield from response.release()

    @asyncio.coroutine
    def work(self):
        # Process queue items forever.
        try:
            while True:
                url, max_redirect = yield from self.q.get()
                assert url in self.seen_urls
                yield from self.fetch(url, max_redirect)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    def url_allowed(self, url):
        if self.exclude and re.search(self.exclude, url):
            return False
        parts = urllib.parse.urlparse(url)
        if parts.scheme not in ('http', 'https'):
            LOGGER.debug('skipping non-http scheme in %r', url)
            return False
        host, port = urllib.parse.splitport(parts.netloc)
        if not self.host_okay(host):
            LOGGER.debug('skipping non-root host in %r', url)
            return False
        return True

    def add_url(self, url, max_redirect=None):
        # Add a URL to the queue if not seen before.
        if max_redirect is None:
            max_redirect = self.max_redirect
        LOGGER.debug('adding %r %r', url, max_redirect)
        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect))

    @asyncio.coroutine
    def crawl(self):
        # Run the crawler until all finished.
        workers = [asyncio.Task(self.work(), loop=self.loop)
                   for _ in range(self.max_tasks)]
        self.t0 = time.time()
        yield from self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()
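A minimal driver for the Crawler above, assuming the pre-async/await aiohttp API the example is written against and the module-level helpers (LOGGER, FetchStatistic, lenient_host, is_redirect) it references:

import asyncio

def main():
    loop = asyncio.get_event_loop()
    crawler = Crawler(['http://example.com/'], max_tasks=10)
    try:
        loop.run_until_complete(crawler.crawl())
    finally:
        crawler.close()
        loop.close()
    print('crawled {} URLs in {:.2f}s'.format(
        len(crawler.done), crawler.t1 - crawler.t0))

if __name__ == '__main__':
    main()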
Beispiel #45
0
class Crawler:
    """Crawl a set of URLs.

    This manages two sets of URLs: 'urls' and 'done'.  'urls' is a set of
    URLs seen, and 'done' is a list of FetchStatistics.
    Here the two collections are seen_urls and done.
    """
    # TODO xpath support
    # TODO uvloop support
    def __init__(self, roots,
                 exclude=None, strict=True,  # What to crawl.
                 max_redirect=10, max_tries=4,  # Per-url limits.
                 max_tasks=10, *, loop=None):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_url(root)
        self.t0 = time.time()
        self.t1 = None

    def close(self):
        """Close resources."""
        self.session.close()

    def host_okay(self, host):
        """Check if a host should be crawled.

        A literal match (after lowercasing) is always good.  For hosts
        that don't look like IP addresses, some approximate matches
        are okay depending on the strict flag.
        """
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)
        else:
            return self._host_okay_lenient(host)

    def _host_okay_strictish(self, host):
        """Check if a host should be crawled, strict-ish version.

        This checks for equality modulo an initial 'www.' component.
        """
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        """Check if a host should be crawled, lenient version.

        This compares the last two components of the host.
        """
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        """Record the FetchStatistic for completed / failed URL."""
        self.done.append(fetch_statistic)

    @asyncio.coroutine
    def parse_links(self, response):
        """Return a FetchStatistic and list of links."""
        links = set()
        content_type = None
        encoding = None
        body = yield from response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = yield from response.text()

                # Replace href with (?:href|src) to follow image links.
                urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''',
                                      text))
                if urls:
                    LOGGER.info('got %r distinct urls from %r',
                                len(urls), response.url)
                for url in urls:
                    normalized = urllib.parse.urljoin(response.url, url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

        stat = FetchStatistic(
            url=response.url,
            next_url=None,
            status=response.status,
            exception=None,
            size=len(body),
            content_type=content_type,
            encoding=encoding,
            num_urls=len(links),
            num_new_urls=len(links - self.seen_urls))

        return stat, links

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        """Fetch one URL."""
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = yield from self.session.get(
                    url, allow_redirects=False)

                if tries > 1:
                    LOGGER.info('try %r for %r success', tries, url)

                break
            except aiohttp.ClientError as client_error:
                LOGGER.info('try %r for %r raised %r', tries, url, client_error)
                exception = client_error

            tries += 1
        else:
            # We never broke out of the loop: all tries failed.
            LOGGER.error('%r failed after %r tries',
                         url, self.max_tries)
            self.record_statistic(FetchStatistic(url=url,
                                                 next_url=None,
                                                 status=None,
                                                 exception=exception,
                                                 size=0,
                                                 content_type=None,
                                                 encoding=None,
                                                 num_urls=0,
                                                 num_new_urls=0))
            return

        try:
            if is_redirect(response):
                location = response.headers['location']
                next_url = urllib.parse.urljoin(url, location)
                self.record_statistic(FetchStatistic(url=url,
                                                     next_url=next_url,
                                                     status=response.status,
                                                     exception=None,
                                                     size=0,
                                                     content_type=None,
                                                     encoding=None,
                                                     num_urls=0,
                                                     num_new_urls=0))

                if next_url in self.seen_urls:
                    return
                if max_redirect > 0:
                    LOGGER.info('redirect to %r from %r', next_url, url)
                    self.add_url(next_url, max_redirect - 1)
                else:
                    LOGGER.error('redirect limit reached for %r from %r',
                                 next_url, url)
            else:
                stat, links = yield from self.parse_links(response)
                self.record_statistic(stat)
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            yield from response.release()

    @asyncio.coroutine
    def work(self):
        """Process queue items forever."""
        try:
            while True:
                url, max_redirect = yield from self.q.get()
                assert url in self.seen_urls
                yield from self.fetch(url, max_redirect)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    def url_allowed(self, url):
        if self.exclude and re.search(self.exclude, url):
            return False
        parts = urllib.parse.urlparse(url)
        if parts.scheme not in ('http', 'https'):
            LOGGER.debug('skipping non-http scheme in %r', url)
            return False
        host, port = urllib.parse.splitport(parts.netloc)
        if not self.host_okay(host):
            LOGGER.debug('skipping non-root host in %r', url)
            return False
        return True

    def add_url(self, url, max_redirect=None):
        """Add a URL to the queue if not seen before."""
        if max_redirect is None:
            max_redirect = self.max_redirect
        LOGGER.debug('adding %r %r', url, max_redirect)
        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect))

    @asyncio.coroutine
    def crawl(self):
        """Run the crawler until all finished."""
        workers = [asyncio.Task(self.work(), loop=self.loop)
                   for _ in range(self.max_tasks)]
        self.t0 = time.time()
        yield from self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()
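Both Crawler variants lean on a few module-level helpers that are not shown in these examples. Sketches consistent with how they are used above (the FetchStatistic fields are taken from the calls above; lenient_host and is_redirect are assumptions about the missing code):

import collections

FetchStatistic = collections.namedtuple(
    'FetchStatistic',
    ['url', 'next_url', 'status', 'exception', 'size',
     'content_type', 'encoding', 'num_urls', 'num_new_urls'])

def lenient_host(host):
    # Keep only the last two components of the host name for comparison.
    return ''.join(host.split('.')[-2:])

def is_redirect(response):
    return response.status in (300, 301, 302, 303, 307)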
Beispiel #46
0
    def __init__(self,
                 roots,
                 exclude=None,
                 strict=True,
                 max_redirect=10,
                 proxy=None,
                 max_tries=4,
                 user_agents=None,
                 max_tasks=10,
                 time_out=15,
                 allowed_paths=None,
                 item_paths=None,
                 *,
                 loop=None):
        if not loop:
            asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
            self.loop = asyncio.get_event_loop()
        else:
            self.loop = loop
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        # self.proxy = proxy
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.time_out = time_out
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self._session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()

        self._allowed_paths = None
        if allowed_paths:
            self._allowed_paths = allowed_paths

        self._item_paths = None
        if item_paths:
            self._item_paths = item_paths

        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_url(root)

        self.user_agents = self.USER_AGENTS
        self._user_agents = user_agents or self.USER_AGENTS
        self.t0 = time.time()
        self.t1 = None
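When no loop is supplied, this __init__ installs uvloop's event-loop policy before creating the loop, so every subsequently created loop is uvloop-backed. In isolation that step looks like this (a sketch; it requires the uvloop package):

import asyncio
import uvloop

asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
loop = asyncio.get_event_loop()  # now backed by uvloop
print(type(loop))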
Beispiel #47
0
class BaseCrawler(object):
    USER_AGENTS = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'
    ]
    ALLOW_CONTENT_TYPE = ('text/html', 'application/xml')
    ALLOWED_PATHS = None
    ITEM_PATHS = None

    def __init__(self,
                 roots,
                 exclude=None,
                 strict=True,
                 max_redirect=10,
                 proxy=None,
                 max_tries=4,
                 user_agents=None,
                 max_tasks=10,
                 time_out=15,
                 allowed_paths=None,
                 item_paths=None,
                 *,
                 loop=None):
        if not loop:
            asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
            self.loop = asyncio.get_event_loop()
        else:
            self.loop = loop
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        # self.proxy = proxy
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.time_out = time_out
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self._session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()

        self._allowed_paths = None
        if allowed_paths:
            self._allowed_paths = allowed_paths

        self._item_paths = None
        if item_paths:
            self._item_paths = item_paths

        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_url(root)

        self.user_agents = self.USER_AGENTS
        self._user_agents = user_agents or self.USER_AGENTS
        self.t0 = time.time()
        self.t1 = None

    @property
    def session(self):
        if not self._session:
            self._session = aiohttp.ClientSession(loop=self.loop)
        return self._session

    @property
    def proxy(self):
        proxy = 'http://{}'.format(
            requests.get("http://127.0.0.1:5010/get/").text)
        logger.info(proxy)
        return proxy

    @property
    def allowed_paths(self):
        if self._allowed_paths is None:
            self._allowed_paths = self.ALLOWED_PATHS
        return self._allowed_paths

    @property
    def item_paths(self):
        if self._item_paths is None:
            self._item_paths = self.ITEM_PATHS
        return self._item_paths

    def host_okay(self, host):
        """Check if a host should be crawled.
    A literal match (after lowercasing) is always good.  For hosts
    that don't look like IP addresses, some approximate matches
    are okay depending on the strict flag.
    """
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)
        return self._host_okay_lenient(host)

    def _host_okay_strictish(self, host):
        """Check if a host should be crawled, strict-ish version.
      This checks for equality modulo an initial 'www.' component.
      """
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        """Check if a host should be crawled, lenient version.
      This compares the last two components of the host.
      """
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        """Record the FetchStatistic for completed / failed URL."""
        self.done.append(fetch_statistic)

    def get_random_user_agent(self):
        if len(self._user_agents) == 1:
            return self._user_agents[0]
        return random.choice(self._user_agents)

    def close(self):
        self.session.close()

    def add_url(self, url, max_redirect=None, meta=None):
        if meta is None:
            meta = {}
        if max_redirect is None:
            max_redirect = self.max_redirect
        logger.debug('adding %r %r', url, max_redirect)
        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect, meta))

    async def parse_item(self, url, data, *args, **kwargs):
        allowed, parse_function = self.parse_item_allowed(url)
        if allowed:
            await parse_function(url, data, *args, **kwargs)

    def parse_item_allowed(self, url):
        if self.item_paths:
            for key, rule in self.item_paths.items():
                if not re.search(rule, url):
                    continue
                return True, self.get_parse_function(key)
        return False, None

    def get_parse_function(self, name):
        parse_function_name = 'parse_{}'.format(name)
        if hasattr(self, parse_function_name):
            return getattr(self, parse_function_name)
        logger.error('Not Implemented method: %r', parse_function_name)
        raise NotImplementedError

    def path_allowed(self, url):
        if self.allowed_paths:
            logger.debug(self.allowed_paths)
            for rule in self.allowed_paths:
                if not re.search(rule, url):
                    continue
                return True
        return False

    async def parse(self, url, response, **kwargs):
        links = set()
        content_type = None
        encoding = None
        body = await response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            encoding = pdict.get('charset', 'utf-8')
            if content_type in self.ALLOW_CONTENT_TYPE:
                data = await response.text()
                links = await self._parse_links(response.url, data)
                await self.parse_item(url, data, **kwargs)

        stat = FetchStatistic(url=response.url.human_repr(),
                              next_url=None,
                              status=response.status,
                              exception=None,
                              size=len(body),
                              content_type=content_type,
                              encoding=encoding,
                              num_urls=len(links),
                              num_new_urls=len(links - self.seen_urls))
        return stat, links

    async def _parse_links(self, base_url, text):
        links = set()

        # Replace href with (?:href|src) to follow image links.
        urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text))
        if urls:
            logger.info('got %r distinct urls from %r', len(urls), base_url)
        for url in urls:
            try:
                normalized = urllib.parse.urljoin(base_url.human_repr(), url)
                # normalized = base_url.join(url)
                defragmented, frag = urllib.parse.urldefrag(normalized)
            except TypeError as type_error:
                logger.error('join error happen on base_url: %r, url: %r',
                             base_url, url)
                continue
            if self.url_allowed(defragmented):
                links.add(defragmented)
        return links

    def headers(self, **kwargs):
        headers = {'User-Agent': self.get_random_user_agent()}
        headers.update(**kwargs)
        return headers

    async def fetch(self, url, max_redirect, meta=None):
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                with async_timeout.timeout(self.time_out):
                    headers = self.headers()
                    response = await self.session.get(url,
                                                      headers=headers,
                                                      proxy=self.proxy,
                                                      allow_redirects=False)

                    if tries > 1:
                        logger.info('try %r for %r success', tries, url)

                    break
            except aiohttp.ClientError as client_error:
                logger.info('try %r for %r raised %r', tries, url,
                            client_error)
                exception = client_error
            except asyncio.TimeoutError as timeout_error:
                logger.info('try %r for %r raised %r', tries, url,
                            timeout_error)
                exception = timeout_error
            except Exception as e:
                logger.info('try %r for %r raised %r', tries, url, e)
                exception = e

            tries += 1
        else:
            # We never broke out of the loop: all tries failed.
            logger.error('%r failed after %r tries', url, self.max_tries)
            self.record_statistic(
                FetchStatistic(url=url,
                               next_url=None,
                               status=None,
                               exception=exception,
                               size=0,
                               content_type=None,
                               encoding=None,
                               num_urls=0,
                               num_new_urls=0))
            return

        try:
            if is_redirect(response):
                location = response.headers['location']
                next_url = urllib.parse.urljoin(url, location)
                self.record_statistic(
                    FetchStatistic(url=url,
                                   next_url=next_url,
                                   status=response.status,
                                   exception=None,
                                   size=0,
                                   content_type=None,
                                   encoding=None,
                                   num_urls=0,
                                   num_new_urls=0))

                if next_url in self.seen_urls:
                    return
                if max_redirect > 0:
                    if self.url_allowed(next_url):
                        logger.info('redirect to %r from %r', next_url, url)
                        self.add_url(next_url, max_redirect - 1)
                else:
                    logger.error('redirect limit reached for %r from %r',
                                 next_url, url)
            else:
                stat, links = await self.parse(url, response)
                self.record_statistic(stat)
                for link in links.difference(self.seen_urls):
                    self.add_url(link, meta=meta)
                self.seen_urls.update(links)
        finally:
            await response.release()

    async def work(self):
        try:
            while True:
                url, max_redirect, meta = await self.q.get()
                assert url in self.seen_urls
                await self.fetch(url, max_redirect, meta)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    def url_allowed(self, url):
        if self.exclude and re.search(self.exclude, url):
            return False
        parts = urllib.parse.urlparse(url)
        if parts.scheme not in ('http', 'https'):
            logger.debug('skipping non-http scheme in %r', url)
            return False
        host, port = urllib.parse.splitport(parts.netloc)
        if not self.host_okay(host):
            logger.debug('skipping non-root host in %r', url)
            return False
        return self.path_allowed(url)

    async def crawl(self):
        workers = [
            asyncio.Task(self.work(), loop=self.loop)
            for _ in range(self.max_tasks)
        ]

        self.t0 = time.time()
        await self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()
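parse_item_allowed() matches a URL against ITEM_PATHS and get_parse_function() resolves the matching key to a method named parse_<key>. A minimal subclass sketch showing that contract (the site, the regexes and the parser body are made up for illustration):

class ExampleCrawler(BaseCrawler):
    ALLOWED_PATHS = [r'/list/', r'/item/\d+']  # URLs the crawler may follow
    ITEM_PATHS = {'detail': r'/item/\d+'}      # key names the parse_<key> method

    async def parse_detail(self, url, data, *args, **kwargs):
        # Called by parse_item() for every fetched URL matching ITEM_PATHS['detail'].
        logger.info('parsing item page %r (%d characters)', url, len(data))

# Usage sketch:
# crawler = ExampleCrawler(['http://example.com/list/'])
# crawler.loop.run_until_complete(crawler.crawl())
# crawler.close()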
Beispiel #48
0
class Spider:
    def __init__(self, max_tries=30, max_tasks=10, timeout=5,
                 rootDir=os.getcwd()):
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.loop = asyncio.get_event_loop()
        self.q = Queue(loop=self.loop)
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.timeout = timeout
        self.rootDir = rootDir

    def close(self):
        self.session.close()


    def append_request(self, request):
        self.q.put_nowait(request)


    @asyncio.coroutine
    def _get_request(self):
        r = yield from self.q.get()
        return r

    @asyncio.coroutine
    def fetch(self, request_type, url, params, data):
        """Fetch one URL"""
        tries = 0
        content = None
        while tries < self.max_tries:
            try:
                print("try %s---->%d times"%(url, tries))
                with aiohttp.Timeout(self.timeout):
                    response = yield from self.session.get(url, params=params)
                    if response.status == 200:
                        content_type = response.headers.get('content-type')
                        if content_type in CONTENT_TYPE_TEXT:
                            with aiohttp.Timeout(self.timeout):
                                content = yield from response.text(encoding='GBK')
                        else:
                            with aiohttp.Timeout(self.timeout):
                                content = yield from response.read()
                    break
            except asyncio.TimeoutError:
                print("timeout")
            except aiohttp.ClientError as client_error:
                print("client error")
            except Exception:
                print("unknown error")
            tries += 1
        else:
            print("try %s---->more than %d times, quit"%(url, tries))
            return None

        response.release()
        return content

    @asyncio.coroutine
    def _work(self):
        """Process queue items forever."""
        try:
            while True:
                r = yield from self._get_request()
                content = yield from self.fetch(r.request_type, r.url, r.params, r.data)
                if content:
                    r.handle_func(content)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    @asyncio.coroutine
    def work(self):
        yield from self._work()

    @asyncio.coroutine
    def spider(self):
        """run  the spider until all finished"""
        workers = [asyncio.Task(self.work(),loop=self.loop)
                   for _ in range (self.max_tasks)]
        yield from self.q.join()

        for w in workers:
            w.cancel()
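work() expects each queued object to carry request_type, url, params, data and a handle_func callback. A hypothetical request container and driver, only to show the expected shape:

import asyncio
import collections

Request = collections.namedtuple(
    'Request', ['request_type', 'url', 'params', 'data', 'handle_func'])

def save_page(content):
    print('got %d characters' % len(content))

spider = Spider(max_tasks=5)
spider.append_request(Request('get', 'http://example.com/', None, None, save_page))
spider.loop.run_until_complete(spider.spider())
spider.close()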
Beispiel #49
0
class Crawler:
    """Crawl a set of URLs.

    This manages two sets of URLs: 'urls' and 'done'.  'urls' is a set of
    URLs seen, and 'done' is a list of FetchStatistics.
    """
    def __init__(self, roots,
                 exclude=None, strict=True,  # What to crawl.
                 max_redirect=10, max_tries=4,  # Per-url limits.
                 max_tasks=10, loop=None):

        get_domain(roots)

        if not os.path.exists(os.path.dirname(path)):
            os.makedirs(os.path.dirname(path))

        with open(path, 'w') as temp_file:
            print('writing')
            temp_file.write('Domain name:')
            temp_file.write(roots)
            temp_file.write('\n \n')

        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.connector = aiohttp.TCPConnector(loop=self.loop)
        self.root_domains = set()
#        for root in roots:
#            parts = urllib.parse.urlparse(root)
#            host, port = urllib.parse.splitport(parts.netloc)
#            if not host:
#                continue
#            if re.match(r'\A[\d\.]*\Z', host):
#                self.root_domains.add(host)
#            else:
##                host = host.lower()
#                if self.strict:
#                    self.root_domains.add(host)
#                else:
#                    self.root_domains.add(lenient_host(host))
#        for root in roots:
#            print("true root")
#            print(root)
#            self.add_url(root)
        self.add_url(roots)
        self.t0 = time.time()
        self.t1 = None

    def close(self):
        """Close resources."""
        self.connector.close()

    def host_okay(self, host):
        """Check if a host should be crawled.

        A literal match (after lowercasing) is always good.  For hosts
        that don't look like IP addresses, some approximate matches
        are okay depending on the strict flag.
        """
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)
        else:
            return self._host_okay_lenient(host)

    def _host_okay_strictish(self, host):
        """Check if a host should be crawled, strict-ish version.

        This checks for equality modulo an initial 'www.' component.
        """
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        """Check if a host should be crawled, lenient version.

        This compares the last two components of the host.
        """
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        """Record the FetchStatistic for completed / failed URL."""
        self.done.append(fetch_statistic)

    @asyncio.coroutine
    def parse_links(self, response):
        """Return a FetchStatistic and list of links."""
        links = set()
        content_type = None
        encoding = None
        body = yield from response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = yield from response.text()
        #Mick - raw HTML page
                #print(text)
                # Replace href with (?:href|src) to follow image links.
                urls = set(re.findall(r'''(?i)href=["']?([^\s"'<>]+)''',
                                      text))
                if urls:
                    LOGGER.info('got %r distinct urls from %r',
                                len(urls), response.url)

                for url in urls:

                    #if(url.find("/ibm/console/logon.jsp?action=OK"):
                     #   print("There is a login page")

                    normalized = urllib.parse.urljoin(response.url, url)

#                    path = get_domain(str(normalized))

                    with open(path, 'a') as temp_file:
                        temp_file.write(str(normalized) + ',\n')

                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

        stat = FetchStatistic(
            url=response.url,
            next_url=None,
            status=response.status,
            exception=None,
            size=len(body),
            content_type=content_type,
            encoding=encoding,
            num_urls=len(links),
            num_new_urls=len(links - self.seen_urls))

        return stat, links

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        """Fetch one URL."""
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = yield from aiohttp.request(
                    'get', url,
                    connector=self.connector,
                    allow_redirects=False,
                    loop=self.loop)
                if tries > 1:
                    LOGGER.info('try %r for %r success', tries, url)
                break
            except aiohttp.ClientError as client_error:
                LOGGER.info('try %r for %r raised %r', tries, url, client_error)
                exception = client_error

            tries += 1
        else:
            # We never broke out of the loop: all tries failed.
            LOGGER.error('%r failed after %r tries',
                         url, self.max_tries)
            self.record_statistic(FetchStatistic(url=url,
                                                 next_url=None,
                                                 status=None,
                                                 exception=exception,
                                                 size=0,
                                                 content_type=None,
                                                 encoding=None,
                                                 num_urls=0,
                                                 num_new_urls=0))
            return

        if is_redirect(response):
            location = response.headers['location']
            next_url = urllib.parse.urljoin(url, location)
            self.record_statistic(FetchStatistic(url=url,
                                                 next_url=next_url,
                                                 status=response.status,
                                                 exception=None,
                                                 size=0,
                                                 content_type=None,
                                                 encoding=None,
                                                 num_urls=0,
                                                 num_new_urls=0))

            if next_url in self.seen_urls:
                return
            if max_redirect > 0:
                LOGGER.info('redirect to %r from %r', next_url, url)
                self.add_url(next_url, max_redirect - 1)
            else:
                LOGGER.error('redirect limit reached for %r from %r',
                             next_url, url)
        else:
            stat, links = yield from self.parse_links(response)
            self.record_statistic(stat)
            for link in links.difference(self.seen_urls):
                self.q.put_nowait((link, self.max_redirect))
            self.seen_urls.update(links)


    @asyncio.coroutine
    def work(self):
        """Process queue items forever."""
        while True:
            url, max_redirect = yield from self.q.get()
            assert url in self.seen_urls
            yield from self.fetch(url, max_redirect)
            self.q.task_done()

    def url_allowed(self, url):
        if self.exclude and re.search(self.exclude, url):
            return False
        parts = urllib.parse.urlparse(url)
        if parts.scheme not in ('http', 'https'):
            LOGGER.debug('skipping non-http scheme in %r', url)
            return False
        host, port = urllib.parse.splitport(parts.netloc)
        if not self.host_okay(host):
            LOGGER.debug('skipping non-root host in %r', url)
            return False
        return True

    def add_url(self, url, max_redirect=None):
        """Add a URL to the queue if not seen before."""
        if max_redirect is None:
            max_redirect = self.max_redirect
        LOGGER.debug('adding %r %r', url, max_redirect)

        #TODO Mick - getting a new URL
        #print("new url: ")
        #print(url)

#        path = get_domain(url)

#        with open(path, 'w') as temp_file:
#            print('writing')
#            temp_file.write('Domain name:')
#            temp_file.write(url)
#            temp_file.write('\n \n')
#            temp_file.close()


        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect))

    @asyncio.coroutine
    def crawl(self):
        print("crawling...")
        """Run the crawler until all finished."""
        workers = [asyncio.Task(self.work(), loop=self.loop)
                   for _ in range(self.max_tasks)]
        self.t0 = time.time()
        yield from self.q.join()
        assert self.seen_urls == set(stat.url for stat in self.done)
        self.t1 = time.time()
        for w in workers:
            w.cancel()
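This variant writes every discovered URL to a file whose location comes from get_domain() and a module-level path, neither of which is shown. One way they could fit together, purely as an assumption based on how path is used above:

import os
import urllib.parse

path = None  # module-level; the Crawler above reads it after calling get_domain()

def get_domain(root):
    global path
    host = urllib.parse.urlparse(root).netloc or root
    path = os.path.join(os.getcwd(), 'output', host + '.txt')
    return path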
class BaseAsyncCrawler(metaclass=CrawlerMetaClass):
    
    def __init__(self, task):
        self.seen_url = set()
        self.max_tasks = 50
        self.max_retry = 10
        self.loop = asyncio.get_event_loop()
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.q = Queue(loop=self.loop)
        
        self.manager = Manager()
        
        self.q.put_nowait(task)
        
        """
        for debug
        """
        self.item_cnt = 0
        self.page_cnt = 0
        self.f = open("D:\\Acer", 'w')
        print('initialization finished.')
                
    def close(self):
        self.session.close()
        self.f.close()
        
    async def crawl(self):
        workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)]
        try:
            await self.q.join()
            for worker in workers:
                worker.cancel()
        except Exception as e:
            LOGGER.error('[in crawl] unexpected error with message: %s', e)
            raise e 
            
    async def fetch(self, queuingTask):
        #print('fetching')
    
        """
        unpack task tuple
        """
        url, page_type, info_item = queuingTask
        #print(url)
        """
        try to establish connection
        """
        tries = 0
        while tries < self.max_retry:
            try:
                response = await self.session.get(url, headers = self.REQUEST_HEADERS)
                break
            except aiohttp.ClientError:
                pass
            except Exception as e:
                print(e)
            
            tries += 1
        else:
            LOGGER.warning("fail to connect to "+url)
            return 
          
        #print('connection established.')
        try: 
            text = await response.text()
        except Exception as e:
            print('error while getting page content:', e)
            print('failed to get page content from ' + url)
        else:            
            """
            parse response
            """
            try:
                
                todo = self.manager.list()
                done = self.manager.list()
                
                #print(self.__dict__)
                #print(hasattr(self, '__mapping__'))
                
                if hasattr(self, '__mapping__'):
                    parser = self.__mapping__[page_type]
                    #print('parsing '+page_type)
                    if parser.parser_type == ParserType.GENERATOR:
                        p = Process(target=parser, args=(todo, done, self, text,))
                    elif parser.parser_type == ParserType.APPENDER:
                        p = Process(target=parser, args=(todo, done, self, text, info_item,))
                    else:
                        raise Exception('fatal: unrecognized parser type')
                    
                    p.start()
                    p.join()
                    
                    for task in todo:
                        self.q.put_nowait(task)
                    
                    for data_item in done:   
                        self.item_cnt += 1
                        self.f.write(str(self.item_cnt) + "#" + str(data_item) + "\n")
                        self.f.flush()
                        print("NO." + str(self.item_cnt) + " item")   
                    
                else:
                    raise Exception('fatal: uninitialized crawler.')
            except Exception as e:
                print(e)
            finally:
                await response.release()
                
    async def work(self):
        try:
            while True:
                queuingTask = await self.q.get()
                await self.fetch(queuingTask)
                self.q.task_done()
        except asyncio.CancelledError:
            pass
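fetch() unpacks each queued task into (url, page_type, info_item) and looks the parser up in self.__mapping__, which CrawlerMetaClass is presumably expected to populate; neither the metaclass nor any parser is shown. A rough sketch of a GENERATOR-style parser matching the Process(target=parser, args=(todo, done, self, text)) call above — everything here is an assumption about that missing machinery:

import re

def list_page_parser(todo, done, crawler, text):
    # Queue follow-up tasks as (url, page_type, info_item) tuples ...
    for url in set(re.findall(r'href="([^"]+)"', text)):
        todo.append((url, 'detail', None))
    # ... and record a finished data item for the crawler to write out.
    done.append({'links_found': len(todo)})

# Initial task fed to the queue in __init__ (shape only; the concrete
# subclass and its parser registration are not shown):
# crawler = MyCrawler(('http://example.com/list', 'list', None))
# crawler.loop.run_until_complete(crawler.crawl())
# crawler.close()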