Example #1
0
 def test_positional_arguments(self):
     pycurl.CurlMulti(1)
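A note on Example #1: the CurlMulti constructor takes no arguments, which is what this test exercises. A minimal standalone sketch of the same check (the try/except form here is an assumption; the real test suite wraps the call differently):

import pycurl

# Assumption: any positional argument makes the constructor raise TypeError.
try:
    pycurl.CurlMulti(1)
except TypeError as exc:
    print("CurlMulti rejected the positional argument:", exc)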
Example #2
0
 def test_remove_invalid_closed_handle(self):
     m = pycurl.CurlMulti()
     c = util.DefaultCurl()
     c.close()
     m.remove_handle(c)
     del m, c
Example #3
0
    def fetch(self, req_url, output=None):
        """
        Fetch a file.

        Parameters
        ----------
        req_url : str
            URL of the file to retrieve
        output : str, optional
            filename, possibly with path, of the downloaded file.

        TODO: test can_segment == false
        """

        (eurl, size, can_segment) = _check_headers(req_url)
        if output is None:
            output = os.path.split(eurl)[1]

        if len(output) < 1:
            raise RuntimeError("Output file must be provided if URL points "
                               "to a directory.")
        LOG.info('Downloading %s (%d bytes)', output, size)
        segments = self._get_segments(size, can_segment)

        # allocate file space
        afile = open(output, 'wb')
        if size > 0:
            afile.truncate(size)
        afile.close()

        out_file = open(output, 'r+b')
        connections = []
        for i in range(len(segments)):
            c = Connection(eurl, can_segment)
            connections.append(c)

        con = {
            'connections': connections,
            'free': connections[:],
            'working': []
        }

        start_time = time.time()
        elapsed = None
        mcurl = pycurl.CurlMulti()

        while True:
            while segments and con['free']:
                p = segments.pop(0)
                c = con['free'].pop(0)
                c.prepare(out_file, p)
                con['working'].append(c)
                mcurl.add_handle(c.curl)
                LOG.debug('%s:Start downloading', c.name)

            while True:
                ret, handles_num = mcurl.perform()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break

            while True:
                num_q, ok_list, err_list = mcurl.info_read()
                for curl in ok_list:
                    curl.errno = pycurl.E_OK
                    mcurl.remove_handle(curl)

                    c = curl.connection
                    con['working'].remove(c)
                    c.errno = curl.errno
                    c.errmsg = None
                    c.code = curl.getinfo(pycurl.RESPONSE_CODE)

                    if c.code in STATUS_OK:
                        LOG.info('%s: Download succeeded. (%d/%d)', c.name,
                                 c.segment_downloaded, c.segment_size)
                        con['free'].append(c)

                    elif c.code in STATUS_ERROR:
                        msg = '%s:Error < %d >! Connection will be closed'
                        LOG.error(msg, c.name, c.code)
                        con['connections'].remove(c)
                        c.close()
                        segments.append(c.segment)
                        new_c = Connection(c.getopt(pycurl.URL))
                        con['connections'].append(new_c)
                        con['free'].append(new_c)

                    else:
                        msg = '%s: Unhandled HTTP status code %d'
                        raise Exception(msg % (c.name, c.code))

                for curl, errno, errmsg in err_list:
                    curl.errno = errno
                    curl.errmsg = errmsg
                    mcurl.remove_handle(curl)

                    c = curl.connection
                    c.errno = curl.errno
                    c.errmsg = curl.errmsg
                    con['working'].remove(c)
                    msg = '%s:Download failed < %s >'
                    LOG.error(msg, c.name, c.errmsg)
                    if c.can_segment and c.retried < self.max_retry:
                        c.prepare_retry()
                        con['working'].append(c)
                        mcurl.add_handle(c.curl)
                        LOG.error('%s:Try again', c.name)
                    else:
                        raise RuntimeError(c.errmsg)

                if num_q == 0:
                    break

            elapsed = time.time() - start_time
            downloaded = sum(
                [connection.total_downloaded for connection in connections])
            _show_progress(size, downloaded, elapsed)

            if not con['working']:
                break

            mcurl.select(1.0)

        LOG.info('Download Succeeded! Total Elapsed %ds', elapsed)
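Example #3 layers segmentation, retries and progress reporting on top of the standard multi-handle loop. Stripped of those concerns, the perform/info_read pattern it builds on looks roughly like the sketch below (the URLs are placeholders and error handling is minimal; this is an illustration, not the project's code):

import pycurl
from io import BytesIO

urls = ['http://example.com/', 'http://example.org/']  # placeholder URLs

multi = pycurl.CurlMulti()
handles = []
for url in urls:
    curl = pycurl.Curl()
    curl.buffer = BytesIO()
    curl.setopt(pycurl.URL, url)
    curl.setopt(pycurl.WRITEFUNCTION, curl.buffer.write)
    multi.add_handle(curl)
    handles.append(curl)

remaining = len(handles)
while remaining:
    # run the internal curl state machine until it stops asking to be called again
    while True:
        ret, num_active = multi.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break
    # collect transfers that have finished since the last call
    while True:
        num_q, ok_list, err_list = multi.info_read()
        for curl in ok_list:
            multi.remove_handle(curl)
            print(curl.getinfo(pycurl.RESPONSE_CODE), len(curl.buffer.getvalue()))
        for curl, errno, errmsg in err_list:
            multi.remove_handle(curl)
            print('error', errno, errmsg)
        remaining -= len(ok_list) + len(err_list)
        if num_q == 0:
            break
    if remaining:
        multi.select(1.0)  # sleep until more data is available

for curl in handles:
    curl.close()
multi.close()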
Example #4
0
def fireUp(target_list, num_conn, proxy_addr, proxy_port):
    tuples = target_list
    try:
        import signal
        from signal import SIGPIPE, SIG_IGN
        signal.signal(signal.SIGPIPE, signal.SIG_IGN)
    except ImportError:
        pass
    queue = []
    for target in tuples:
        for url in target:
            url = str(url).strip()
            if not url or url[0] == "#":
                continue
            cb = callback()
            queue.append((url, cb))
    num_urls = len(queue)
    num_conn = min(num_conn, num_urls)
    assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
    m = pycurl.CurlMulti()
    m.handles = []
    for i in range(num_conn):
        c = pycurl.Curl()
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        c.setopt(pycurl.MAXREDIRS, 5)
        c.setopt(pycurl.CONNECTTIMEOUT, 30)
        c.setopt(pycurl.TIMEOUT, 300)
        c.setopt(pycurl.NOSIGNAL, 1)
        c.setopt(
            pycurl.USERAGENT,
            """Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"""
        )
        c.setopt(pycurl.HTTPHEADER, [
            """Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5""",
            """Accept-Language: en-us,en;q=0.5""",
            """Accept-Encoding: gzip,deflate""",
            """Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7""",
            """Keep-Alive: 300""", """Connection: keep-alive"""
        ])
        if proxy_addr != '':
            c.setopt(pycurl.PROXY, proxy_addr)
            c.setopt(pycurl.PROXYPORT, proxy_port)
        m.handles.append(c)
    freelist = m.handles[:]
    num_processed = 0
    while num_processed < num_urls:
        while queue and freelist:
            url, cb = queue.pop(0)
            c = freelist.pop()
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.WRITEFUNCTION, cb.feed)
            m.add_handle(c)
            c.url = url
        while 1:
            ret, num_handles = m.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        while 1:
            num_q, ok_list, err_list = m.info_read()
            for c in ok_list:
                m.remove_handle(c)
                freelist.append(c)
            for c, errno, errmsg in err_list:
                m.remove_handle(c)
                freelist.append(c)
            num_processed = num_processed + len(ok_list) + len(err_list)
            if num_q == 0:
                break
        m.select(1.0)
    for c in m.handles:
        c.close()
    m.close()
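The fireUp() function above depends on a callback() factory that is not included in the snippet. A minimal stand-in (an assumption, not the original project's class) only needs a feed() method usable as pycurl.WRITEFUNCTION:

class callback(object):
    # Hypothetical stand-in for the callback() factory used by fireUp().
    def __init__(self):
        self.contents = b''

    def feed(self, buf):
        # pycurl hands each received chunk to WRITEFUNCTION as bytes
        self.contents += buf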
Example #5
0
def crawl(urls, sbook, fbook, num_conn=500):

    success_count = 0
    failure_count = 0
    start_time = time.time()
    import sys
    import pycurl

    # We should ignore SIGPIPE when using pycurl.NOSIGNAL

    try:
        import signal
        from signal import SIGPIPE, SIG_IGN
        signal.signal(signal.SIGPIPE, signal.SIG_IGN)
    except ImportError:
        pass

    # Make a queue with (url, filename) tuples
    queue = []
    for url in urls:
        url = url.strip()
        if not url or url[0] == "#":
            continue
        filename = str(md5.new(url).hexdigest()) + ".uss"
        queue.append((url, filename))

    # Check args
    assert queue, "no URLs given"
    num_urls = len(queue)
    num_conn = min(num_conn, num_urls)
    assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
    print "I got", num_urls, "URLs to process..."

    # Pre-allocate a list of curl objects
    m = pycurl.CurlMulti()
    m.handles = []
    for i in range(num_conn):
        c = pycurl.Curl()
        c.fp = None
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        c.setopt(pycurl.MAXREDIRS, 3)
        c.setopt(pycurl.CONNECTTIMEOUT, 60)
        c.setopt(pycurl.TIMEOUT, 300)
        c.setopt(pycurl.LOW_SPEED_LIMIT, 0)
        c.setopt(pycurl.LOW_SPEED_TIME, 0)
        c.setopt(pycurl.NOSIGNAL, 1)
        m.handles.append(c)

    # Main loop
    freelist = m.handles[:]
    num_processed = 0
    while num_processed < num_urls:

        # If there is an url to process and a free curl object, add to multi stack
        while queue and freelist:
            url, filename = queue.pop(0)
            c = freelist.pop()
            c.fp = open(filename, "wb")
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.WRITEDATA, c.fp)
            m.add_handle(c)
            # store some info
            c.filename = filename
            c.url = url
        # Run the internal curl state machine for the multi stack
        while 1:
            ret, num_handles = m.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        # Check for curl objects which have terminated, and add them to the freelist
        while 1:

            num_q, ok_list, err_list = m.info_read()

            for c in ok_list:
                c.fp.close()
                c.fp = None
                m.remove_handle(c)
                success_count += 1
                pattern = "-->" + str(
                    c.filename) + ":::" + str(c.url) + ":::" + str(
                        c.getinfo(pycurl.EFFECTIVE_URL)) + chr(10)
                sbook.write(pattern)
                sbook.flush()
                freelist.append(c)
            for c, errno, errmsg in err_list:
                c.fp.close()
                c.fp = None
                m.remove_handle(c)
                failure_count += 1
                pattern = "-->" + str(c.filename) + ":::" + str(
                    c.url) + ":::" + str(errno) + ":::" + str(errmsg) + chr(10)
                fbook.write(pattern)
                fbook.flush()
                freelist.append(c)
            num_processed = num_processed + len(ok_list) + len(err_list)
            if num_q == 0:
                break
        msg = "Total Processed:" + str(num_processed) + ", Ok:" + str(
            success_count) + ", Not Ok:" + str(
                failure_count) + ", Time:" + str(time.time() - start_time)
        #sys.stdout.write("\r"+str(msg))
        #sys.stdout.flush()
        # Currently no more I/O is pending, could do something in the meantime
        # (display a progress bar, etc.).
        # We just call select() to sleep until some more data is available.
        m.select(1.0)

    # Cleanup
    for c in m.handles:
        if c.fp is not None:
            c.fp.close()
            c.fp = None
        c.close()
    m.close()
    return num_processed, success_count, failure_count
Example #6
0
    def fetch(self):
        """
        Download urls via multicurl.
        
        Get new tasks from queue.
        """
        m = pycurl.CurlMulti()
        m.handles = []

        # Create curl instances
        for x in xrange(self.thread_number):
            curl = pycurl.Curl()
            m.handles.append(curl)

        freelist = m.handles[:]

        # This is infinite cycle
        # You can break it only from outside code which
        # iterates over result of this method
        while True:

            cached_request = None

            while len(freelist):

                # Increase request counter
                if (self.request_limit is not None
                        and self.counters['request'] >= self.request_limit):
                    logging.debug('Request limit is reached: %s' %
                                  self.request_limit)
                    if len(freelist) == self.thread_number:
                        yield None
                    else:
                        break
                else:
                    try:
                        priority, task = self.taskq.get(True, 0.1)
                    except Empty:
                        # If all handles are free and there are no tasks
                        # in the queue, yield the None signal
                        if len(freelist) == self.thread_number:
                            yield None
                        else:
                            break
                    else:
                        if not self._preprocess_task(task):
                            continue

                        task.network_try_count += 1
                        if task.task_try_count == 0:
                            task.task_try_count = 1

                        if task.task_try_count > self.task_try_limit:
                            logging.debug('Task tries ended: %s / %s' %
                                          (task.name, task.url))
                            self.add_item('too-many-task-tries', task.url)
                            continue

                        if task.network_try_count > self.network_try_limit:
                            logging.debug('Network tries ended: %s / %s' %
                                          (task.name, task.url))
                            self.add_item('too-many-network-tries', task.url)
                            continue

                        #import pdb; pdb.set_trace()
                        if task.grab:
                            grab = task.grab
                        else:
                            # Set up curl instance via Grab interface
                            grab = Grab(**self.grab_config)
                            grab.setup(url=task.url)

                        if self.use_cache and not task.get('disable_cache'):
                            if grab.detect_request_method() == 'GET':
                                url = grab.config['url']
                                cache_item = self.cache.find_one({'_id': url})
                                if cache_item:
                                    #if url in self.cache:
                                    #cache_item = pickle.loads(self.cache[url])
                                    #logging.debug('From cache: %s' % url)
                                    cached_request = (grab, grab.clone(), task,
                                                      cache_item)
                                    grab.prepare_request()
                                    self.inc_count('request-cache')

                                    # break from the pre-request cycle
                                    # and go to the process-response code
                                    break

                        self.inc_count('request-network')
                        if self.proxylist_config:
                            args, kwargs = self.proxylist_config
                            grab.setup_proxylist(*args, **kwargs)

                        curl = freelist.pop()
                        curl.grab = grab
                        curl.grab.curl = curl
                        curl.grab_original = grab.clone()
                        curl.grab.prepare_request()
                        curl.task = task
                        # Add configured curl instance to multi-curl processor
                        m.add_handle(curl)

            # If there were done network requests
            if len(freelist) != self.thread_number:
                while True:
                    status, active_objects = m.perform()
                    if status != pycurl.E_CALL_MULTI_PERFORM:
                        break

            if cached_request:
                grab, grab_original, task, cache_item = cached_request
                url = task.url  # or grab.config['url']
                grab.fake_response(cache_item['body'])

                def custom_prepare_response(g):
                    g.response.head = cache_item['head'].encode('utf-8')
                    g.response.body = cache_item['body'].encode('utf-8')
                    g.response.code = cache_item['response_code']
                    g.response.time = 0
                    g.response.url = cache_item['url']
                    g.response.parse('utf-8')
                    g.response.cookies = g.extract_cookies()

                grab.process_request_result(custom_prepare_response)

                yield {
                    'ok': True,
                    'grab': grab,
                    'grab_original': grab_original,
                    'task': task,
                    'ecode': None,
                    'emsg': None
                }
                self.inc_count('request')

            while True:
                queued_messages, ok_list, fail_list = m.info_read()

                results = []
                for curl in ok_list:
                    results.append((True, curl, None, None))
                for curl, ecode, emsg in fail_list:
                    results.append((False, curl, ecode, emsg))

                for ok, curl, ecode, emsg in results:
                    res = self.process_multicurl_response(
                        ok, curl, ecode, emsg)
                    m.remove_handle(curl)
                    freelist.append(curl)
                    yield res
                    self.inc_count('request')

                if not queued_messages:
                    break

            m.select(0.5)
Example #7
0
    def setUp(self):
        super(MultiOptionConstantsTest, self).setUp()

        self.m = pycurl.CurlMulti()
Example #8
0
    def __init__(self,
                 url,
                 method,
                 data=None,
                 kerberos_auth=False,
                 allow_redirects=True,
                 verify_ssl=True,
                 ca=None,
                 use_json=False,
                 headers=None,
                 stream=False,
                 username=None,
                 password=None,
                 client_cert=None,
                 client_key=None,
                 verbose=False):
        self.finished = False  # have we read all data?
        self.closed = False  # have we destroyed curl resources?

        self.status_code = 0
        self.headers = None
        self.response_buffer = BytesIO()
        self.headers_buffer = BytesIO()
        self.response_decoder = None

        self.url = url
        headers = headers or {}
        method = method.lower()

        self.c = pycurl.Curl()
        self.curl_multi = pycurl.CurlMulti()

        if method == 'post':
            self.c.setopt(pycurl.POST, 1)
            headers["Expect"] = ""  # openshift can't handle Expect
        elif method == 'get':
            self.c.setopt(pycurl.HTTPGET, 1)
        elif method == 'put':
            # self.c.setopt(pycurl.PUT, 1)
            self.c.setopt(pycurl.CUSTOMREQUEST, b"PUT")
            headers["Expect"] = ""
        elif method == 'delete':
            self.c.setopt(pycurl.CUSTOMREQUEST, b"DELETE")
        else:
            raise RuntimeError("Unsupported method '%s' for curl call!" %
                               method)

        self.c.setopt(pycurl.COOKIEFILE, b'')
        self.c.setopt(pycurl.URL, str(url))
        self.c.setopt(pycurl.WRITEFUNCTION, self.response_buffer.write)
        self.c.setopt(pycurl.HEADERFUNCTION, self.headers_buffer.write)
        self.c.setopt(pycurl.DEBUGFUNCTION, self._curl_debug)
        self.c.setopt(pycurl.SSL_VERIFYPEER, 1 if verify_ssl else 0)
        self.c.setopt(pycurl.SSL_VERIFYHOST, 2 if verify_ssl else 0)
        if ca:
            logger.info("Setting CAINFO to %r", ca)
            self.c.setopt(pycurl.CAINFO, ca)

        self.c.setopt(pycurl.VERBOSE, 1 if verbose else 0)
        if username and password:
            username = username.encode('utf-8')
            password = password.encode('utf-8')
            self.c.setopt(pycurl.USERPWD, username + b":" + password)

        if client_cert and client_key:
            self.c.setopt(pycurl.SSLCERTTYPE, "PEM")
            self.c.setopt(pycurl.SSLKEYTYPE, "PEM")
            self.c.setopt(pycurl.SSLCERT, client_cert)
            self.c.setopt(pycurl.SSLKEY, client_key)

        if data:
            # curl sets the method to post if one sets any POSTFIELDS (even '')
            self.c.setopt(pycurl.POSTFIELDS, data)

        if use_json:
            headers['Content-Type'] = b'application/json'

        if allow_redirects:
            self.c.setopt(pycurl.FOLLOWLOCATION, 1)

        if kerberos_auth:
            self.c.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_GSSNEGOTIATE)
            self.c.setopt(pycurl.USERPWD, b':')

        if stream:
            headers['Cache-Control'] = b'no-cache'

        if headers:
            header_list = []
            for header_key, header_value in headers.items():
                header_list.append(str("%s: %s" % (header_key, header_value)))
            self.c.setopt(pycurl.HTTPHEADER, header_list)

        self.curl_multi.add_handle(self.c)

        # Send request and read all headers. We have all headers once we receive some data or once
        # the response ends.
        # NOTE: HTTP response in chunked encoding can contain additional headers ("trailers") in the
        # last chunk. This is not handled here.
        while not (self.finished or self._any_data_received()):
            self._select()
            self._perform()

        self.headers = parse_headers(self.headers_buffer.getvalue())
        self.status_code = self.c.getinfo(pycurl.HTTP_CODE)
        self.response_decoder = codecs.getincrementaldecoder(self.encoding)()
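The _select() and _perform() helpers called at the end of Example #8 are not shown in the snippet. A hedged, standalone equivalent of the "perform until the first data arrives, then parse the headers" idea might look like this (the URL and buffer names are illustrative only):

import pycurl
from io import BytesIO

curl = pycurl.Curl()
multi = pycurl.CurlMulti()
headers_buffer = BytesIO()
response_buffer = BytesIO()

curl.setopt(pycurl.URL, 'http://example.com/')  # placeholder URL
curl.setopt(pycurl.HEADERFUNCTION, headers_buffer.write)
curl.setopt(pycurl.WRITEFUNCTION, response_buffer.write)
multi.add_handle(curl)

finished = False
while not (finished or response_buffer.getvalue()):
    multi.select(1.0)                       # wait for socket activity
    while True:
        ret, num_handles = multi.perform()  # drive the transfer
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break
    finished = (num_handles == 0)

# once body data has arrived (or the response ended), the status line and
# all response headers are sitting in headers_buffer
print(headers_buffer.getvalue().split(b'\r\n', 1)[0].decode('iso-8859-1'))
print(curl.getinfo(pycurl.RESPONSE_CODE))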
Example #9
0
    def test_multi_socket_select(self):
        sockets = set()
        timeout = 0

        urls = [
            'http://localhost:8380/success',
            'http://localhost:8381/success',
            'http://localhost:8382/success',
        ]

        socket_events = []
        
        # socket callback
        def socket(event, socket, multi, data):
            if event == pycurl.POLL_REMOVE:
                #print("Remove Socket %d"%socket)
                sockets.remove(socket)
            else:
                if socket not in sockets:
                    #print("Add socket %d"%socket)
                    sockets.add(socket)
            socket_events.append((event, multi))

        # init
        m = pycurl.CurlMulti()
        m.setopt(pycurl.M_PIPELINING, 1)
        m.setopt(pycurl.M_SOCKETFUNCTION, socket)
        m.handles = []
        for url in urls:
            c = pycurl.Curl()
            # save info in standard Python attributes
            c.url = url
            c.body = util.StringIO()
            c.http_code = -1
            m.handles.append(c)
            # pycurl API calls
            c.setopt(c.URL, c.url)
            c.setopt(c.WRITEFUNCTION, c.body.write)
            m.add_handle(c)

        # get data
        num_handles = len(m.handles)

        while pycurl.E_CALL_MULTI_PERFORM == m.socket_all()[0]:
            pass
            
        timeout = m.timeout()

        # timeout might be -1, indicating that all work is done
        # XXX make sure there is always work to be done here?
        while timeout >= 0:
            rr, wr, er = select.select(sockets, sockets, sockets, timeout / 1000.0)
            socket_set = set(rr + wr + er)
            if socket_set:
                for s in socket_set:
                    while True:
                        ret, running = m.socket_action(s, 0)
                        if ret != pycurl.E_CALL_MULTI_PERFORM:
                            break
            else:
                ret, running = m.socket_action(pycurl.SOCKET_TIMEOUT, 0)
            if running == 0:
                break

        for c in m.handles:
            # save info in standard Python attributes
            c.http_code = c.getinfo(c.HTTP_CODE)

        # at least POLL_IN and POLL_REMOVE events per socket
        assert len(socket_events) >= 6, 'Less than 6 socket events: %s' % repr(socket_events)

        # print result
        for c in m.handles:
            self.assertEqual('success', c.body.getvalue())
            self.assertEqual(200, c.http_code)
            
            # multi, not curl handle
            self.check(pycurl.POLL_IN, m, socket_events)
            self.check(pycurl.POLL_REMOVE, m, socket_events)
        
        # close handles
        for c in m.handles:
            # pycurl API calls
            m.remove_handle(c)
            c.close()
        m.close()
Example #10
0
def crawl(urls, sbook, fbook, num_conn=500):
    #! /usr/bin/env python
    # -*- coding: iso-8859-1 -*-
    # vi:ts=4:et
    # $Id: retriever-multi.py,v 1.29 2005/07/28 11:04:13 mfx Exp $

    #
    # Usage: python retriever-multi.py <file with URLs to fetch> [<# of
    #          concurrent connections>]
    #

    import sys
    import pycurl

    # We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
    # the libcurl tutorial for more info.
    try:
        import signal
        from signal import SIGPIPE, SIG_IGN
        signal.signal(signal.SIGPIPE, signal.SIG_IGN)
    except ImportError:
        pass

    # Make a queue with (url, filename) tuples
    queue = []
    for url in urls:
        url = url.strip()
        if not url or url[0] == "#":
            continue
        filename = str(md5.new(url).hexdigest()) + ".uss"
        queue.append((url, filename))

    # Check args
    assert queue, "no URLs given"
    num_urls = len(queue)
    num_conn = min(num_conn, num_urls)
    assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
    print "PycURL %s (compiled against 0x%x)" % (
        pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM)
    print "----- Getting", num_urls, "URLs using", num_conn, "connections -----"

    # Pre-allocate a list of curl objects
    m = pycurl.CurlMulti()
    m.handles = []
    for i in range(num_conn):
        c = pycurl.Curl()
        c.fp = None
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        c.setopt(pycurl.MAXREDIRS, 3)
        c.setopt(pycurl.CONNECTTIMEOUT, 120)
        c.setopt(pycurl.TIMEOUT, 300)
        c.setopt(pycurl.NOSIGNAL, 1)
        m.handles.append(c)

    # Main loop
    freelist = m.handles[:]
    num_processed = 0
    while num_processed < num_urls:

        # If there is an url to process and a free curl object, add to multi stack
        while queue and freelist:
            url, filename = queue.pop(0)
            c = freelist.pop()
            c.fp = open(filename, "wb")
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.WRITEDATA, c.fp)
            m.add_handle(c)
            # store some info
            c.filename = filename
            c.url = url
        # Run the internal curl state machine for the multi stack
        while 1:
            ret, num_handles = m.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        # Check for curl objects which have terminated, and add them to the freelist
        while 1:

            num_q, ok_list, err_list = m.info_read()

            for c in ok_list:
                c.fp.close()
                c.fp = None
                m.remove_handle(c)
                print "Success:", c.filename, c.url, c.getinfo(
                    pycurl.EFFECTIVE_URL)
                pattern = "-->" + str(
                    c.filename) + ":::" + str(c.url) + ":::" + str(
                        c.getinfo(pycurl.EFFECTIVE_URL)) + chr(10)
                sbook.write(pattern)
                sbook.flush()
                freelist.append(c)
            for c, errno, errmsg in err_list:
                c.fp.close()
                c.fp = None
                m.remove_handle(c)
                print "Failed: ", c.filename, c.url, errno, errmsg
                pattern = "-->" + str(c.filename) + ":::" + str(
                    c.url) + ":::" + str(errno) + ":::" + str(errmsg) + chr(10)
                fbook.write(pattern)
                fbook.flush()
                freelist.append(c)
            num_processed = num_processed + len(ok_list) + len(err_list)
            if num_q == 0:
                break
        # Currently no more I/O is pending, could do something in the meantime
        # (display a progress bar, etc.).
        # We just call select() to sleep until some more data is available.
        m.select(1.0)

    # Cleanup
    for c in m.handles:
        if c.fp is not None:
            c.fp.close()
            c.fp = None
        c.close()
    m.close()
Example #11
0
def curlcrawl(urls, num_conn=1, maxlink=100, dumpdir=None, mode=0750):
    """    
    Crawl a list of sites.

    This function contains a urlmap dict which keeps on growing.
    Ideally there should be a very limited number of URLs.
    This method causes memory to bloat and needs to be improved.
    """

    totalfetched = 0
    try:
        import signal
        from signal import SIGPIPE, SIG_IGN
        signal.signal(signal.SIGPIPE, signal.SIG_IGN)
    except ImportError:
        pass
    queue = []
    # urlmap will keep on growing, so we need to call this function with a better structure

    urlmap = dict()
    linkcounts = dict()
    globalbuffer = dict()
    for url in urls:
        url = url.strip()
        if not url:
            continue
        queue.append(url)
        linkcounts[urlparse.urlparse(url).netloc] = 0
        urlmap[url] = urlparse.urlparse(url).netloc

#    print queue
    num_urls = len(queue)
    num_conn = min(num_conn, num_urls)

    assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
    m = pycurl.CurlMulti()
    m.handles = []

    for i in range(num_conn):
        c = pycurl.Curl()
        c.fp = None
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        c.setopt(pycurl.MAXREDIRS, 5)
        c.setopt(pycurl.CONNECTTIMEOUT, 30)
        c.setopt(pycurl.TIMEOUT, 300)
        c.setopt(pycurl.NOSIGNAL, 1)
        m.handles.append(c)

    freelist = m.handles[:]
    num_processed = 0
    while num_processed < num_urls:
        # If there is an url to process and a free curl object, add to multi stack
        while queue and freelist:
            url = queue.pop(0)
            c = freelist.pop()
            #            globalbuffer[url] = StringIO.StringIO()
            globalbuffer[url] = urlbuffer(dumpdir, url, mode)
            c.fp = globalbuffer[url].write
            # print url
            c.setopt(pycurl.URL, url.encode('utf-8'))

            # the following uses WRITEFUNCTION to write the data; to store it in memory instead, use StringIO
            c.setopt(pycurl.WRITEFUNCTION, c.fp)
            #    print 'adding url ', ' ', url
            m.add_handle(c)
            # store some info
            c.url = url
        # Run the internal curl state machine for the multi stack
        while 1:
            ret, num_handles = m.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        # Check for curl objects which have terminated, and add them to the freelist
        while 1:
            # numq is the number of messages still queued
            num_q, ok_list, err_list = m.info_read()
            for c in ok_list:
                c.fp = None
                m.remove_handle(c)
                eurl = c.getinfo(pycurl.EFFECTIVE_URL)
                parent = urlmap[c.url]
                freelist.append(c)

                if linkcounts[parent] > maxlink:
                    htmlc = None
                links = getlinks(globalbuffer[c.url].getvalue(), eurl, parent)
                #   print 'gt links' + len(links)
                #    if dumpdir is not None:

                for link in links:
                    if linkcounts[parent] <= maxlink and urlmap.get(
                            link) is None:
                        queue.append(link)
                        urlmap[link] = parent
                        num_urls = num_urls + 1
                        linkcounts[parent] = linkcounts[parent] + 1
                totalfetched = totalfetched + 1
        #       print 'fetched ', ' ', eurl

            for c, errno, errmsg in err_list:
                c.fp = None
                m.remove_handle(c)
                #               print "Failed: ", c.url, errno, errmsg
                freelist.append(c)
            num_processed = num_processed + len(ok_list) + len(err_list)
            if num_q == 0:
                break
        # Currently no more I/O is pending, could do something in the meantime
        # (display a progress bar, etc.).
        # We just call select() to sleep until some more data is available.
        m.select(1.0)
    for c in m.handles:
        if c.fp is not None:
            c.fp = None
            c.close()
    m.close()
    return totalfetched
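curlcrawl() relies on two project helpers that are not shown here, urlbuffer and getlinks. A minimal in-memory stand-in for urlbuffer (an assumption that ignores dumpdir and mode) only needs write() and getvalue():

class urlbuffer(object):
    # Hypothetical stand-in for the urlbuffer helper used by curlcrawl().
    def __init__(self, dumpdir, url, mode):
        self.url = url
        self._chunks = []

    def write(self, buf):
        # collect the chunks pycurl passes to WRITEFUNCTION
        self._chunks.append(buf)

    def getvalue(self):
        return b''.join(self._chunks)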
Example #12
0
def download(
    concurrent_connections,
    iterator,
    save_result,
    max_redirects=DEFAULT_MAX_REDIRECTS,
    connect_timeout_seconds=DEFAULT_CONNECT_TIMEOUT_SECONDS,
    timeout_seconds=DEFAULT_TIMEOUT_SECONDS,
):
    # We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
    # the libcurl tutorial for more info.
    signal.signal(SIGPIPE, SIG_IGN)
    curl_multi = pycurl.CurlMulti()
    curl_multi.handles = []
    for i in range(concurrent_connections):
        curl = pycurl.Curl()
        curl.fp = None
        curl.setopt(pycurl.FOLLOWLOCATION, 1)
        curl.setopt(pycurl.MAXREDIRS, max_redirects)
        curl.setopt(pycurl.CONNECTTIMEOUT, connect_timeout_seconds)
        curl.setopt(pycurl.TIMEOUT, int(timeout_seconds))
        curl.setopt(pycurl.NOSIGNAL, 1)
        curl_multi.handles.append(curl)
    try:
        freelist = curl_multi.handles[:]
        while True:
            while len(freelist) > 0:
                urlobj = next(iterator)
                if urlobj is None:
                    break
                else:
                    curl = freelist.pop()
                    curl.setopt(pycurl.URL, urlobj['url'])
                    curl.fp = open(urlobj['output_filename'], "wb")
                    curl.hfp = open(
                        urlobj['header_filename'], "wb") if urlobj.get(
                            'header_filename') is not None else None
                    curl.setopt(pycurl.WRITEDATA, curl.fp)
                    if curl.hfp is not None:
                        curl.setopt(pycurl.WRITEHEADER, curl.hfp)
                    curl_multi.add_handle(curl)
                    curl.urlobj = urlobj
            if len(freelist) == concurrent_connections:
                time.sleep(SLEEP_TIME_SECONDS_IF_NONE_RUNNING)
            else:
                while True:
                    ret, num_running_handles = curl_multi.perform()
                    if ret != pycurl.E_CALL_MULTI_PERFORM:
                        break
                while True:
                    num_handles_in_queue, ok_list, err_list = curl_multi.info_read(
                    )
                    for curl in ok_list:
                        curl.fp.close()
                        curl.fp = None
                        if curl.hfp is not None:
                            curl.hfp.close()
                            curl.hfp = None
                        curl_multi.remove_handle(curl)
                        save_result(curl.urlobj,
                                    response_code=curl.getinfo(
                                        pycurl.RESPONSE_CODE))
                        curl.urlobj = None
                        freelist.append(curl)
                    for curl, errno, errmsg in err_list:
                        curl.fp.close()
                        curl.fp = None
                        if curl.hfp is not None:
                            curl.hfp.close()
                            curl.hfp = None
                        curl_multi.remove_handle(curl)
                        save_result(curl.urlobj, errno=errno, errmsg=errmsg)
                        curl.urlobj = None
                        freelist.append(curl)
                    if num_handles_in_queue == 0:
                        break
                curl_multi.select(1.0)
    finally:
        for curl in curl_multi.handles:
            if getattr(curl, 'fp', None) is not None:
                curl.fp.close()
                curl.fp = None
            if getattr(curl, 'hfp', None) is not None:
                curl.hfp.close()
                curl.hfp = None
            curl.urlobj = None
            curl.close()
        curl_multi.close()
Example #13
0
def muti_curl():
    # maximum number of connections
    num_conn = 20

    queue = []
    # urls = ['http://www.baidu.com/'] * 10
    urls = ['http://127.0.0.1:8082/'] * 10
    for url in urls:
        queue.append(url)

    num_urls = len(queue)
    num_conn = min(num_conn, num_urls)
    print('----- Getting', num_urls, 'Max conn', num_conn, 'connections -----')

    m = pycurl.CurlMulti()
    # initialize reusable curl handles
    m.handles = []
    for i in range(num_conn):
        c = pycurl.Curl()
        # c.body = StringIO()
        c.body = BytesIO()
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        c.setopt(pycurl.MAXREDIRS, 5)
        c.setopt(pycurl.CONNECTTIMEOUT, 30)
        c.setopt(pycurl.TIMEOUT, 300)
        c.setopt(pycurl.NOSIGNAL, 1)
        m.handles.append(c)

    freelist = m.handles[:]
    num_processed = 0
    # start of the main loop

    while num_processed < num_urls:

        # add request URLs to the multi stack
        while queue and freelist:
            url = queue.pop()
            c = freelist.pop()
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.WRITEFUNCTION, c.body.write)
            m.add_handle(c)
            c.url = url
            # print url

        # perform the requests
        while 1:
            (ret, num_handles) = m.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break

        # block for a while until some connections complete
        m.select(1.0)

        # read the completed connections
        while 1:
            (num_q, ok_list, err_list) = m.info_read()
            for c in ok_list:
                m.remove_handle(c)
                # print c.body.getvalue()
                freelist.append(c)

            for (c, errno, errmsg) in err_list:
                m.remove_handle(c)
                print('Failed: ', c.url, errno, errmsg)
                freelist.append(c)
            num_processed = num_processed + len(ok_list) + len(err_list)
            if num_q == 0:
                break

    for c in m.handles:
        c.fp = None
        c.close()
    m.close()
Example #14
0
 def test_keyword_arguments(self):
     pycurl.CurlMulti(a=1)
Example #15
0
 def __init__(self, in_max_requests=10, in_options={}):
     self.max_requests = in_max_requests
     self.options = in_options
     
     self.outstanding_requests = {}
     self.multi_handle = pycurl.CurlMulti()
Example #16
0
def download(url, target_file=None):
    "Download a file in parts, displaying progress. This function returns when the download is complete."
    sys.stdout.write("\033[s")

    manager = pycurl.CurlMulti()
    state, state_shelf_file_name = get_state_shelf(url, target_file)

    if state and "file_size" in state:
        # Continuation of earlier run
        state["parts"] = {}
        state["new_handles"] = []
        state["manager"] = manager
        state["fd"] = open(state["target_file"], "rb+")
        download_loop_handle_info_read(state)
    else:
        state.update({
            "parts": {},
            "done": IntervalSet(),
            "url": url,
            "manager": manager,
            "new_handles": [],
            "target_file": target_file
        })
        gen_curl(state, "0-")

    # cURL main loop:
    # Handle cURL events and add new handles as they become available (they are
    # spawned above in the progress callback)
    while True:
        while state["new_handles"]:
            manager.add_handle(state["new_handles"].pop(0))
        ret, num_handles = manager.perform()
        download_loop_handle_info_read(state)
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break
    while num_handles or state["new_handles"]:
        update_progress(state)
        while state["new_handles"]:
            manager.add_handle(state["new_handles"].pop(0))
        if manager.select(1) == -1:
            continue
        while True:
            while state["new_handles"]:
                manager.add_handle(state["new_handles"].pop(0))
            ret, num_handles = manager.perform()
            download_loop_handle_info_read(state)
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break

    # Write remaining data
    for download in state["parts"]:
        flush_buffer(state, download)

    update_progress(state)
    if "fd" in state:
        state["fd"].close()
    state.sync()

    fail_state = False
    if "canceled" in state:
        # KeyboardInterrupt -- do not delete intermediates in this case.
        fail_state = True
    else:
        # Check for errors
        if "file_size" in state and state["file_size"]:
            covered = IntervalSet()
            covered.add((0, state["file_size"] - 1))
            left_over = covered - state["done"]
            if left_over.contained:
                print(
                    "\nSome parts of the file failed to download, namely bytes %s"
                    % left_over)
                fail_state = True

        for download, dl_state in list(state["parts"].items()):
            if "cancel_status" in dl_state and dl_state[
                    "cancel_status"] == "deliberate":
                continue
            errstr = download.errstr()
            if errstr:
                if not fail_state:
                    print()
                print(errstr)
                fail_state = True

    if fail_state:
        bytes_written = sum([b - a + 1 for a, b in state["done"].contained])
        if not bytes_written:
            os.unlink(state["target_file"])
            del state
            os.unlink(state_shelf_file_name)
    else:
        error_output("Download finished.")
        del state
        os.unlink(state_shelf_file_name)

    return fail_state
Example #17
0
def download(queue_type,
             queue_directory,
             output_directory,
             concurrent_connections,
             max_downloads=None):
    if queue_type == 'timedout':
        timeout_seconds = MAX_TIMEOUT_SECONDS
    else:
        timeout_seconds = MIN_TIMEOUT_SECONDS
    # We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
    # the libcurl tutorial for more info.
    signal.signal(SIGPIPE, SIG_IGN)
    m = pycurl.CurlMulti()
    m.handles = []
    for i in range(int(concurrent_connections)):
        c = pycurl.Curl()
        c.fp = None
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        c.setopt(pycurl.MAXREDIRS, DOWNLOAD_MAX_REDIRECTS)
        c.setopt(pycurl.CONNECTTIMEOUT, DOWNLOAD_CONNECT_TIMEOUT)
        c.setopt(pycurl.TIMEOUT, int(timeout_seconds))
        c.setopt(pycurl.NOSIGNAL, 1)
        m.handles.append(c)
    start_time = datetime.datetime.now()
    try:
        while True:
            num_processed = 0
            reached_max_downloads = False
            total_read_lines = 0
            domains_last_start_times = {}
            skipped_due_to_domain_start_time = 0
            total_stats = {
                'num_existing_hash_id': 0,
                'num_new_hash_id': 0,
                'num_error_urls': 0,
                'num_timeout_urls': 0
            }
            freelist = m.handles[:]
            eof = False
            downloaded_url_ids = set()
            if os.path.exists(os.path.join(queue_directory, 'output.txt')):
                with open(os.path.join(queue_directory, 'output.txt')) as f:
                    for line in f:
                        downloaded_url_ids.add(int(line.strip()))
            with open(os.path.join(queue_directory, 'output.txt'),
                      'a') as output_file:
                with tempfile.TemporaryDirectory() as tmpdir:

                    def save_result(url,
                                    url_id,
                                    errno=None,
                                    errmsg=None,
                                    response_code=None):
                        is_timeout = errno == pycurl.E_OPERATION_TIMEDOUT
                        now = datetime.datetime.now()
                        url_relative_output_dir = os.path.join(
                            str(now.year), str(now.month), str(now.day),
                            str(now.hour), str(now.minute), str(url_id))
                        url_output_dir = os.path.join(output_directory,
                                                      url_relative_output_dir)
                        url_relative_output_filename = os.path.join(
                            url_relative_output_dir, "output")
                        output_filename = os.path.join(tmpdir, str(url_id),
                                                       "output")
                        header_filename = os.path.join(tmpdir, str(url_id),
                                                       "header")
                        hash_id = None
                        if errno is None and response_code == 200:
                            filesize = os.path.getsize(output_filename)
                            if filesize > 0:
                                hasher = hashlib.sha256()
                                with open(output_filename, 'rb') as f:
                                    buf = f.read(HASH_BLOCKSIZE)
                                    while len(buf) > 0:
                                        hasher.update(buf)
                                        buf = f.read(HASH_BLOCKSIZE)
                                hash = hasher.hexdigest()
                                try:
                                    db.execute(
                                        "insert into hash (hash, size_bytes, download_path, downloaded_at) values (%s, %s, %s, %s)",
                                        (hash, filesize,
                                         url_relative_output_filename,
                                         datetime.datetime.now().strftime(
                                             DATETIME_FORMAT)))
                                    total_stats['num_new_hash_id'] += 1
                                    os.makedirs(url_output_dir)
                                    os.rename(
                                        output_filename,
                                        os.path.join(
                                            output_directory,
                                            url_relative_output_filename))
                                except db.UniqueViolation:
                                    total_stats['num_existing_hash_id'] += 1
                                    os.unlink(output_filename)
                                hash_id = db.only_one(
                                    "select id from hash where hash=%s and size_bytes=%s",
                                    (hash, filesize))['id']
                        else:
                            if is_timeout:
                                total_stats['num_timeout_urls'] += 1
                            else:
                                total_stats['num_error_urls'] += 1
                            os.unlink(output_filename)
                        os.unlink(header_filename)
                        os.rmdir(os.path.join(tmpdir, str(url_id)))
                        url_update_history_id = db.only_one(
                            "insert into url_update_history (url_id, updated_at, hash_id, error, error_code, timedout_seconds) values (%s, %s, %s, %s, %s, %s) RETURNING id",
                            (url_id,
                             datetime.datetime.now().strftime(DATETIME_FORMAT),
                             hash_id, errmsg, errno or response_code,
                             timeout_seconds if is_timeout else None))['id']
                        try:
                            db.execute(
                                "insert into url_last_update (url_id, url_update_history_id) values (%s, %s)",
                                (url_id, url_update_history_id))
                        except db.UniqueViolation:
                            db.execute(
                                "update url_last_update set url_update_history_id=%s where url_id=%s",
                                (url_update_history_id, url_id))
                        if hash_id:
                            try:
                                db.execute(
                                    "insert into url_last_successful_update (url_id, url_update_history_id) values (%s, %s)",
                                    (url_id, url_update_history_id))
                            except db.UniqueViolation:
                                db.execute(
                                    "update url_last_successful_update set url_update_history_id=%s where url_id=%s",
                                    (url_update_history_id, url_id))
                        output_file.write(str(url_id) + "\n")

                    with open(os.path.join(queue_directory,
                                           'queue.txt')) as queue_file:
                        while True:
                            while freelist and not eof:
                                line = queue_file.readline()
                                if line == '':
                                    eof = True
                                else:
                                    total_read_lines += 1
                                    tmp = line.strip().split(" ")
                                    url_id, url = tmp[0], ' '.join(tmp[1:])
                                    if int(url_id) not in downloaded_url_ids:
                                        domain = url.split('://')[1].split(
                                            '/')[0]
                                        domain_last_start_time = domains_last_start_times.get(
                                            domain, None)
                                        now = datetime.datetime.now()
                                        if not domain_last_start_time or (
                                                now - domain_last_start_time
                                        ).total_seconds(
                                        ) >= DOWNLOAD_DOMAIN_THROTTLE_SECONDS:
                                            domains_last_start_times[
                                                domain] = now
                                            c = freelist.pop()
                                            os.mkdir(
                                                os.path.join(
                                                    tmpdir, str(url_id)))
                                            c.fp = open(
                                                os.path.join(
                                                    tmpdir, str(url_id),
                                                    "output"), "wb")
                                            c.hfp = open(
                                                os.path.join(
                                                    tmpdir, str(url_id),
                                                    "header"), "wb")
                                            c.setopt(pycurl.URL, url)
                                            c.setopt(pycurl.WRITEDATA, c.fp)
                                            c.setopt(pycurl.WRITEHEADER, c.hfp)
                                            m.add_handle(c)
                                            c.url_id = url_id
                                            c.url = url
                                        else:
                                            skipped_due_to_domain_start_time += 1
                            while True:
                                ret, num_handles = m.perform()
                                if ret != pycurl.E_CALL_MULTI_PERFORM:
                                    break
                            while True:
                                num_q, ok_list, err_list = m.info_read()
                                for c in ok_list:
                                    c.fp.close()
                                    c.fp = None
                                    c.hfp.close()
                                    c.hfp = None
                                    m.remove_handle(c)
                                    save_result(c.url,
                                                c.url_id,
                                                response_code=c.getinfo(
                                                    pycurl.RESPONSE_CODE))
                                    freelist.append(c)
                                for c, errno, errmsg in err_list:
                                    c.fp.close()
                                    c.fp = None
                                    c.hfp.close()
                                    c.hfp = None
                                    m.remove_handle(c)
                                    save_result(c.url,
                                                c.url_id,
                                                errno=errno,
                                                errmsg=errmsg)
                                    freelist.append(c)
                                num_processed = num_processed + len(
                                    ok_list) + len(err_list)
                                if max_downloads and num_processed >= int(
                                        max_downloads):
                                    reached_max_downloads = True
                                    break
                                if num_q == 0:
                                    break
                            if num_q == 0 and num_handles == 0 and eof:
                                break
                            if max_downloads and num_processed >= int(
                                    max_downloads):
                                reached_max_downloads = True
                                break
                            m.select(1.0)
            if len(downloaded_url_ids) == total_read_lines:
                break
            elif (datetime.datetime.now() - start_time
                  ).total_seconds() > MAX_DOWNLOAD_RUNTIME_SECONDS:
                break
            else:
                time.sleep(DOWNLOAD_ITERATIONS_SLEEP_SECONDS)
    finally:
        for c in m.handles:
            if getattr(c, 'fp', None) is not None:
                c.fp.close()
                c.fp = None
            if getattr(c, 'hfp', None) is not None:
                c.hfp.close()
                c.hfp = None
            c.close()
        m.close()
    return (len(downloaded_url_ids), num_processed, reached_max_downloads,
            total_read_lines, skipped_due_to_domain_start_time,
            total_stats['num_existing_hash_id'],
            total_stats['num_new_hash_id'], total_stats['num_error_urls'],
            total_stats['num_timeout_urls'])
Example #18
0
    def check_pause(self, call):
        # the app sleeps for 0.5 seconds
        self.curl.setopt(pycurl.URL, 'http://localhost:8380/pause')
        sio = util.BytesIO()
        state = dict(paused=False, resumed=False)
        if call:

            def writefunc(data):
                rv = sio.write(data)
                if not state['paused']:
                    self.curl.pause(pycurl.PAUSE_ALL)
                    state['paused'] = True
                return rv
        else:

            def writefunc(data):
                if not state['paused']:
                    # cannot write to sio here, because
                    # curl takes pause return value to mean that
                    # nothing was written
                    state['paused'] = True
                    return pycurl.READFUNC_PAUSE
                else:
                    return sio.write(data)

        def resume(*args):
            state['resumed'] = True
            self.curl.pause(pycurl.PAUSE_CONT)

        signal.signal(signal.SIGALRM, resume)
        # alarm for 1 second which is 0.5 seconds more than the server side
        # should sleep for
        signal.alarm(1)
        start = _time.time()
        self.curl.setopt(pycurl.WRITEFUNCTION, writefunc)

        m = pycurl.CurlMulti()
        m.add_handle(self.curl)

        # Number of seconds to wait for a timeout to happen
        SELECT_TIMEOUT = 1.0

        # Stir the state machine into action
        while 1:
            ret, num_handles = m.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break

        # Keep going until all the connections have terminated
        while num_handles:
            # The select method uses fdset internally to determine which file descriptors
            # to check.
            m.select(SELECT_TIMEOUT)
            while 1:
                if _time.time() - start > 2:
                    # test is taking too long, fail
                    assert False, 'Test is taking too long'
                ret, num_handles = m.perform()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break

        # Cleanup
        m.remove_handle(self.curl)
        m.close()

        self.assertEqual('part1part2', sio.getvalue().decode())
        end = _time.time()
        # check that client side waited
        self.assertTrue(end - start > 1)

        assert state['resumed']
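The check_pause test above drives its transfer with the standard CurlMulti
perform()/select() loop that most of the examples below also rely on. A
minimal, self-contained sketch of that driver pattern, assuming a reachable
placeholder URL and an arbitrary 1-second select timeout:

import pycurl
from io import BytesIO

def fetch_one(url='http://example.com/', select_timeout=1.0):
    # placeholder URL and timeout; adjust for a real transfer
    buf = BytesIO()
    c = pycurl.Curl()
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.WRITEFUNCTION, buf.write)

    m = pycurl.CurlMulti()
    m.add_handle(c)

    num_handles = 1
    while num_handles:
        # keep calling perform() while libcurl reports more work to do
        while True:
            ret, num_handles = m.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        # sleep on select() until the sockets are ready for the next pass
        if num_handles:
            m.select(select_timeout)

    m.remove_handle(c)
    m.close()
    c.close()
    return buf.getvalue()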
Example #19
0
    def test_multi_socket(self):
        urls = [
            'http://localhost:8380/success',
            'http://localhost:8381/success',
            'http://localhost:8382/success',
        ]

        socket_events = []

        # socket callback
        def socket(event, socket, multi, data):
            #print(event, socket, multi, data)
            socket_events.append((event, multi))

        # init
        m = pycurl.CurlMulti()
        m.setopt(pycurl.M_PIPELINING, 1)
        m.setopt(pycurl.M_SOCKETFUNCTION, socket)
        m.handles = []
        for url in urls:
            c = pycurl.Curl()
            # save info in standard Python attributes
            c.url = url
            c.body = util.StringIO()
            c.http_code = -1
            m.handles.append(c)
            # pycurl API calls
            c.setopt(c.URL, c.url)
            c.setopt(c.WRITEFUNCTION, c.body.write)
            m.add_handle(c)

        # get data
        num_handles = len(m.handles)
        while num_handles:
            while 1:
                ret, num_handles = m.socket_all()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break
            # currently no more I/O is pending, could do something in the meantime
            # (display a progress bar, etc.)
            m.select(0.1)

        for c in m.handles:
            # save info in standard Python attributes
            c.http_code = c.getinfo(c.HTTP_CODE)

        # at least one POLL_IN and one POLL_REMOVE event per socket
        assert len(socket_events) >= 6

        # print result
        for c in m.handles:
            self.assertEqual('success', c.body.getvalue())
            self.assertEqual(200, c.http_code)

            # multi, not curl handle
            self.check(pycurl.POLL_IN, m, socket_events)
            self.check(pycurl.POLL_REMOVE, m, socket_events)

        # close handles
        for c in m.handles:
            # pycurl API calls
            m.remove_handle(c)
            c.close()
        m.close()
Example #20
0
    def __init_curl(self, urls, maxconn=20):
        total = len(urls)
        num_conn = min(total, maxconn)
        m = pycurl.CurlMulti()
        m.handles = []
        for i in range(num_conn):
            c = pycurl.Curl()
            c.html = None
            c = self.__setopt(c)
            m.handles.append(c)

        freelist = m.handles[:]
        num_processed = 0
        while num_processed < total:
            while urls and freelist:
                c = freelist.pop()
                c.k, c.url = urls.pop(0)
                c.html = StringIO.StringIO()
                c.setopt(pycurl.URL, c.url)
                c.setopt(pycurl.WRITEFUNCTION, c.html.write)
                m.add_handle(c)

            while 1:
                ret, num_handles = m.perform()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break

            while 1:
                num, ok, err = m.info_read()
                for c in ok:

                    if len(re.findall("Error [#]2012", c.html.getvalue())) > 0:
                        print " | Patent Errored Out Bro!"
                        return

                    self.sql.c.execute(
                        "INSERT OR REPLACE INTO {tbl} (key, url, html, created) VALUES (?, ?, ?, ?)"
                        .format(tbl=self.SQLtbl),
                        (c.k, c.url, senAdd.uni2asc(
                            c.html.getvalue()), time.time()))
                    c.html.close()
                    c.html = None
                    m.remove_handle(c)
                    freelist.append(c)
                for c, errno, errmsg in err:
                    # log the failure and return the handle to the pool so the
                    # crawl can keep making progress
                    print "  > error: {key}, {url}: {msg}".format(key=c.k, url=c.url, msg=errmsg)
                    c.html.close()
                    c.html = None
                    m.remove_handle(c)
                    freelist.append(c)

                sys.stdout.write("{clear}  - {x}".format(clear="\b" * 20,
                                                         x=num_processed))
                num_processed = num_processed + len(ok) + len(err)
                if num_processed % 800 == 0 and num_processed > 0:
                    self.sql.conn.commit()
                if num == 0:
                    break

        for c in m.handles:
            if c.html is not None:
                c.html.close()
                c.html = None
            c.close()
        m.close()
        self.sql.conn.commit()
        print ""
Example #21
0
class DahuaEventThread(threading.Thread):
    """Connects to device and subscribes to events"""
    Devices = []
    NumActivePlayers = 0

    CurlMultiObj = pycurl.CurlMulti()
    NumCurlObjs = 0

    def __init__(self, mqtt, cameras):
        """Construct a thread listening for events."""

        self.basetopic = mqtt["basetopic"]

        self.client = paho.Client("CameraEvents-" + socket.gethostname(),
                                  clean_session=True)
        self.client.on_connect = self.mqtt_on_connect
        self.client.on_disconnect = self.mqtt_on_disconnect
        self.client.message_callback_add(self.basetopic + "/+/picture",
                                         self.mqtt_on_picture_message)
        self.client.message_callback_add(self.basetopic + "/+/alerts",
                                         self.mqtt_on_alert_message)

        self.client.will_set(self.basetopic + "/$online",
                             False,
                             qos=0,
                             retain=True)

        self.alerts = True

        for device_cfg in cameras:

            device = DahuaDevice(device_cfg.get("name"), device_cfg,
                                 self.client, self.basetopic)
            self.Devices.append(device)

            CurlObj = pycurl.Curl()
            device.CurlObj = CurlObj

            CurlObj.setopt(pycurl.URL, device.url)

            CurlObj.setopt(pycurl.CONNECTTIMEOUT, 30)
            CurlObj.setopt(pycurl.TCP_KEEPALIVE, 1)
            CurlObj.setopt(pycurl.TCP_KEEPIDLE, 30)
            CurlObj.setopt(pycurl.TCP_KEEPINTVL, 15)
            if device.auth == 'digest':
                CurlObj.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_DIGEST)
                CurlObj.setopt(pycurl.USERPWD,
                               "%s:%s" % (device.user, device.password))
            else:
                CurlObj.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_BASIC)
                CurlObj.setopt(pycurl.USERPWD,
                               "%s:%s" % (device.user, device.password))
            CurlObj.setopt(pycurl.WRITEFUNCTION, device.OnReceive)

            self.CurlMultiObj.add_handle(CurlObj)
            self.NumCurlObjs += 1

            _LOGGER.debug("Added Dahua device at: %s", device.url)

        #connect to mqtt broker

        _LOGGER.debug("Connecting to MQTT Broker")
        self.client.connect(mqtt["IP"], int(mqtt["port"]), 60)

        _LOGGER.debug("Starting MQTT Loop")
        self.client.loop_start()

        threading.Thread.__init__(self)
        self.stopped = threading.Event()

    def run(self):
        """Fetch events"""
        heartbeat = 0
        while 1:
            Ret, NumHandles = self.CurlMultiObj.perform()
            if Ret != pycurl.E_CALL_MULTI_PERFORM:
                break

        Ret = self.CurlMultiObj.select(1.0)
        while not self.stopped.isSet():
            # Sleeps to ease load on processor
            time.sleep(.05)
            heartbeat = heartbeat + 1
            if heartbeat % 1000 == 0:
                _LOGGER.debug("Heartbeat: " + str(datetime.datetime.now()))
                if not self.client.connected_flag:
                    self.client.reconnect()
                self.client.publish(self.basetopic + "/$heartbeat",
                                    str(datetime.datetime.now()))

            Ret, NumHandles = self.CurlMultiObj.perform()

            if NumHandles != self.NumCurlObjs:
                _, Success, Error = self.CurlMultiObj.info_read()

                for CurlObj in Success:
                    DahuaDevice = next(
                        iter(
                            filter(lambda x: x.CurlObj == CurlObj,
                                   self.Devices)), None)
                    if DahuaDevice.Reconnect:
                        _LOGGER.debug("Dahua Reconnect: %s", DahuaDevice.Name)
                        continue

                    DahuaDevice.OnDisconnect("Success")
                    DahuaDevice.Reconnect = time.time() + 5

                for CurlObj, ErrorNo, ErrorStr in Error:
                    DahuaDevice = next(
                        iter(
                            filter(lambda x: x.CurlObj == CurlObj,
                                   self.Devices)), None)
                    if DahuaDevice.Reconnect:
                        continue

                    DahuaDevice.OnDisconnect("{0} ({1})".format(
                        ErrorStr, ErrorNo))
                    DahuaDevice.Reconnect = time.time() + 5

                for DahuaDevice in self.Devices:
                    if DahuaDevice.Reconnect and DahuaDevice.Reconnect < time.time(
                    ):
                        self.CurlMultiObj.remove_handle(DahuaDevice.CurlObj)
                        self.CurlMultiObj.add_handle(DahuaDevice.CurlObj)
                        DahuaDevice.Reconnect = None
            #if Ret != pycurl.E_CALL_MULTI_PERFORM: break

    def mqtt_on_connect(self, client, userdata, flags, rc):
        if rc == 0:
            _LOGGER.info("Connected to MQTT OK Returned code={0}".format(rc))
            self.client.connected_flag = True
            self.client.publish(self.basetopic + "/$online",
                                True,
                                qos=0,
                                retain=True)
            self.client.publish(self.basetopic + "/$version", version)
            if self.alerts:
                state = "ON"
            else:
                state = "OFF"

            for device in self.Devices:
                device.alerts = state
                self.client.publish(
                    self.basetopic + "/" + device.Name + "/alerts/state",
                    state)
            #self.client.subscribe(self.basetopic +"/#")
            #self.client.subscribe("CameraEventsPy/alerts")

        else:
            _LOGGER.info(
                "Camera : {0}: Bad mqtt connection Returned code={1}".format(
                    "self.Name", rc))
            self.client.connected_flag = False

    def mqtt_on_disconnect(self, client, userdata, rc):
        logging.info("disconnecting reason  " + str(rc))
        self.client.connected_flag = False

    def mqtt_on_picture_message(self, client, userdata, msg):

        #if msg.payload.decode() == "Hello world!":
        _LOGGER.info("Picture Msg Received: Topic:{0} Payload:{1}".format(
            msg.topic, msg.payload))
        msgchannel = msg.topic.split("/")[1]
        for device in self.Devices:
            channel = device.channelIsMine(msgchannel)
            if channel > -1:
                _LOGGER.debug(
                    "Found Camera: {0} channel: {1}: Name:{2}".format(
                        device.Name, channel, device.channels[channel]))
                device.SnapshotImage(channel + device.snapshotoffset,
                                     msgchannel, "Snap Shot Image")
                break

    def mqtt_on_alert_message(self, client, userdata, msg):
        if msg.payload == 'ON':
            newState = True
        else:
            newState = False

        deviceName = msg.topic.split('/')[1]
        _LOGGER.info("Camera: {0}: Msg Received: Topic:{1} Payload:{2}".format(
            deviceName, msg.topic, msg.payload))
        for device in self.Devices:
            #channel = self.Devices[device].channelIsMine("Garage")
            if device.Name == deviceName:
                device.alerts = newState
                _LOGGER.info("Turning Alerts {0}".format(newState))
                self.client.publish(
                    self.basetopic + "/" + device.Name + "/alerts/state",
                    msg.payload)

    def mqtt_on_cross_message(self, client, userdata, msg):
        if msg.payload == 'ON':
            newState = True
        else:
            newState = False

        deviceName = msg.topic.split('/')[1]
        _LOGGER.info("Camera: {0}: Msg Received: Topic:{1} Payload:{2}".format(
            deviceName, msg.topic, msg.payload))
        for device in self.Devices:
            #channel = self.Devices[device].channelIsMine("Garage")
            if device.Name == deviceName:
                device.alerts = newState
                _LOGGER.info("Turning Alerts {0}".format(newState))
                self.client.publish(
                    self.basetopic + "/" + device.Name + "/alerts/state",
                    msg.payload)
Example #22
0
    def test_multi_timer(self):
        urls = [
            'http://localhost:8380/success',
            'http://localhost:8381/success',
            'http://localhost:8382/success',
        ]

        timers = []
        
        # timer callback
        def timer(msecs):
            #print('Timer callback msecs:', msecs)
            timers.append(msecs)

        # init
        m = pycurl.CurlMulti()
        m.setopt(pycurl.M_TIMERFUNCTION, timer)
        m.handles = []
        for url in urls:
            c = pycurl.Curl()
            # save info in standard Python attributes
            c.url = url
            c.body = util.BytesIO()
            c.http_code = -1
            m.handles.append(c)
            # pycurl API calls
            c.setopt(c.URL, c.url)
            c.setopt(c.WRITEFUNCTION, c.body.write)
            m.add_handle(c)

        # get data
        num_handles = len(m.handles)
        while num_handles:
            while 1:
                ret, num_handles = m.perform()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break
            # currently no more I/O is pending, could do something in the meantime
            # (display a progress bar, etc.)
            m.select(1.0)

        for c in m.handles:
            # save info in standard Python attributes
            c.http_code = c.getinfo(c.HTTP_CODE)

        # print result
        for c in m.handles:
            self.assertEqual('success', c.body.getvalue().decode())
            self.assertEqual(200, c.http_code)
        
        assert len(timers) > 0
        # libcurl 7.23.0 produces a 0 timer
        assert timers[0] >= 0
        # this assertion does not appear to hold on older libcurls
        # or apparently on any linuxes, see
        # https://github.com/p/pycurl/issues/19
        #if not util.pycurl_version_less_than(7, 24):
        #    self.assertEqual(-1, timers[-1])

        # close handles
        for c in m.handles:
            # pycurl API calls
            m.remove_handle(c)
            c.close()
        m.close()
Example #23
0
def multi_get(urls,
              num_conn,
              timeout,
              err_callback,
              succ_callback,
              ua='semRushBot',
              percentile=100):
    result = {}
    queue = deque(list(urls))
    cur_percentile = 0
    print_percentile = 0

    if not queue: return

    num_urls = len(queue)
    num_conn = min(num_conn, num_urls)

    assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
    assert 1 <= percentile <= 100, "invalid percentile"

    logging.debug("PycURL %s (compiled against 0x%x)" %
                  (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM))

    m = pycurl.CurlMulti()
    m.handles = []
    for i in range(num_conn):
        c = pycurl.Curl()
        c.fp = None
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        c.setopt(pycurl.MAXREDIRS, 1)
        c.setopt(pycurl.CONNECTTIMEOUT, timeout)
        c.setopt(pycurl.TIMEOUT, timeout)
        c.setopt(pycurl.NOSIGNAL, 1)
        c.setopt(pycurl.USERAGENT, ua)

        m.handles.append(c)

    freelist = m.handles[:]
    num_processed = 0
    bailout = 0
    while num_processed < num_urls:
        if bailout:
            break

        while queue and freelist:
            c = freelist.pop()
            c.props = queue.popleft()

            if type(c.props['url']) == type(u''):
                c.props['url'] = c.props['url'].encode('utf8', 'replace')

            c.setopt(pycurl.URL, c.props['url'])

            try:
                c.setopt(pycurl.COOKIE, str(c.props['cookie']))
            except KeyError:
                pass

            c.source = cStringIO.StringIO()
            c.header = cStringIO.StringIO()

            c.setopt(pycurl.HEADERFUNCTION, c.header.write)
            c.setopt(pycurl.WRITEFUNCTION, c.source.write)

            m.add_handle(c)

        while 1:
            ret, num_handles = m.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break

        while 1:
            num_q, ok_list, err_list = m.info_read()

            for c in ok_list:
                c.fp = None
                m.remove_handle(c)
                logging.debug("[ ok] %s" % (c.props['url']))
                succ_callback(c)
                freelist.append(c)

            for c, errno, errmsg in err_list:
                c.fp = None
                m.remove_handle(c)
                logging.debug("[err] %s %s" % (c.props['url'], errmsg))
                err_callback(c)
                freelist.append(c)

            num_processed = num_processed + len(ok_list) + len(err_list)

            if num_urls:
                cur_percentile = round(float(num_processed) / num_urls * 100)
                if cur_percentile % 10 == 0 and 0 < cur_percentile < 100 and print_percentile != cur_percentile:
                    logging.info("%d%%" % cur_percentile)
                    print_percentile = cur_percentile

                if (cur_percentile >= percentile):
                    bailout = 1
                    break
            if num_q == 0:
                break
        m.select(1.0)

    m.close()

    return result
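A hedged usage sketch for multi_get above; the URLs and callback bodies are
illustrative assumptions. Each queue item is a dict carrying at least a 'url'
key (and optionally a 'cookie'), and both callbacks receive the finished curl
handle, whose source/header buffers and props dict hold the result:

def on_success(c):
    # c.source holds the response body, c.header the raw response headers
    print("ok  %s (%d bytes)" % (c.props['url'], len(c.source.getvalue())))

def on_error(c):
    print("err %s" % c.props['url'])

multi_get(
    [{'url': 'http://example.com/'}, {'url': 'http://example.org/'}],
    num_conn=2,
    timeout=30,
    err_callback=on_error,
    succ_callback=on_success,
)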
Example #24
0
    def __init__(
        self,
        url=None,
        username=None,
        password=None,
        token=None,
        insecure=False,
        ca_file=None,
        debug=False,
        log=None,
        kerberos=False,
        timeout=0,
        compress=True,
        sso_url=None,
        sso_revoke_url=None,
        sso_token_name='access_token',
        headers=None,
        pipeline=0,
        connections=0,
    ):
        """
        Creates a new connection to the API server.

        This method supports the following parameters:

        `url`:: A string containing the base URL of the server, usually
        something like `https://server.example.com/ovirt-engine/api`.

        `username`:: The name of the user, something like `admin@internal`.

        `password`:: The password of the user.

        `token`:: The token to be used to access the API. Optionally, the user
        can use a token instead of a username and password to access the API.
        If the `token` parameter is not specified, the SDK will automatically
        create one.

        `insecure`:: A boolean flag that indicates if the server TLS
        certificate and host name should be checked.

        `ca_file`:: A PEM file containing the trusted CA certificates. The
        certificate presented by the server will be verified using these CA
        certificates. If the `ca_file` parameter is not set, the system-wide
        CA certificate store is used.

        `debug`:: A boolean flag indicating if debug output should be
        generated. If the value is `True` and the `log` parameter isn't
        `None` then the data sent to and received from the server will
        be written to the log. Be aware that user names and passwords will
        also be written, so handle it with care.

        `log`:: The logger where the log messages will be written.

        `kerberos`:: A boolean flag indicating if Kerberos
        authentication should be used instead of the default basic
        authentication.

        `timeout`:: The maximum total time to wait for the response, in
        seconds. A value of zero (the default) means wait forever. If
        the timeout expires before the response is received, an exception
        will be raised.

        `compress`:: A boolean flag indicating if the SDK should ask
        the server to send compressed responses. The default is `True`.
        Note that this is a hint for the server, and that it may return
        uncompressed data even when this parameter is set to `True`.
        Note that compression will be disabled if the `debug` parameter is
        set to `True`, so that the debug messages are in plain text.

        `sso_url`:: A string containing the base SSO URL of the server.
        The default SSO URL is computed from `url` if no `sso_url` is provided.

        `sso_revoke_url`:: A string containing the base URL of the SSO
        revoke service. This needs to be specified only when using
        an external authentication service. By default this URL
        is automatically calculated from the value of the `url` parameter,
        so that SSO token revoke will be performed using the SSO service
        that is part of the engine.

        `sso_token_name`:: The token name in the JSON SSO response returned
        from the SSO server. Default value is `access_token`.

        `headers`:: A dictionary with headers which should be sent with every
        request.

        `connections`:: The maximum number of connections to open to the host.
        If the value is `0` (the default) then the number of connections will
        be unlimited.

        `pipeline`:: The maximum number of requests to put in an HTTP pipeline
        without waiting for the response. If the value is `0` (the default)
        then pipelining is disabled.
        """

        # Check mandatory parameters:
        if url is None:
            raise Error('The \'url\' parameter is mandatory')

        # Check that the CA file exists if insecure is not set:
        if not insecure:
            if ca_file is not None and not os.path.exists(ca_file):
                raise Error('The CA file \'%s\' doesn\'t exist' % ca_file)

        # Save the URL:
        self._url = url

        # Save the logger:
        self._log = log

        # Save the credentials:
        self._username = username
        self._password = password
        self._sso_token = token
        self._kerberos = kerberos
        self._ca_file = ca_file
        self._insecure = insecure
        self._timeout = timeout
        self._debug = debug
        self._compress = compress

        # The curl object can be used by several threads, but not
        # simultaneously, so we need a lock to prevent that:
        self._curl_lock = threading.Lock()

        # Set SSO attributes:
        self._sso_url = sso_url
        self._sso_revoke_url = sso_revoke_url
        self._sso_token_name = sso_token_name

        # Headers:
        self._headers = headers or {}

        # Create the curl handle that manages the pool of connections:
        self._multi = pycurl.CurlMulti()
        self._multi.setopt(pycurl.M_PIPELINING, bool(pipeline))
        # Since libcurl 7.30.0:
        if hasattr(pycurl, 'M_MAX_PIPELINE_LENGTH'):
            self._multi.setopt(pycurl.M_MAX_PIPELINE_LENGTH, pipeline)
            self._multi.setopt(pycurl.M_MAX_HOST_CONNECTIONS, connections)

        # Connections:
        self._curls = set()

        # Initialize the reference to the system service:
        self.__system_service = None
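A hedged usage sketch for the constructor documented above, assuming the
enclosing class is exposed as Connection; the URL and credentials are
placeholder values:

connection = Connection(
    url='https://server.example.com/ovirt-engine/api',  # placeholder engine URL
    username='admin@internal',
    password='secret',      # placeholder credentials
    ca_file='ca.pem',       # trusted CA bundle
    debug=False,
)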
Example #25
0
    def retrieve(self):
        if self.remote_inst_id is None:
            self.target_queue.append(RootTarget(self, self.endpoint_root))
            num_conn = 1
            full = True
        else:
            num_conn = self.get_cand_num_conn()
            if not num_conn:
                return False

            full = num_conn >= self.own_max_num_conn
            if full:
                num_conn = self.own_max_num_conn

        m = pycurl.CurlMulti()
        m.handles = []
        for i in range(num_conn):
            c = pycurl.Curl()
            c.setopt(pycurl.CONNECTTIMEOUT, 30)
            c.setopt(pycurl.TIMEOUT, 300)
            m.handles.append(c)

        freelist = m.handles[:]
        num_started = 0
        num_processed = 0
        while True:
            target = None
            if freelist:
                target = self.pop_target()

            while target:
                assert freelist

                c = freelist.pop()
                num_started += 1
                if target.get_verb() == 'DELETE':
                    c.setopt(pycurl.CUSTOMREQUEST, 'DELETE')
                else: # reset after potential DELETE
                    c.unsetopt(pycurl.CUSTOMREQUEST)
                    c.setopt(pycurl.HTTPGET, True)

                c.setopt(pycurl.URL, target.url)
                c.target = target

                if target.has_plaintext_body():
                    c.setopt(pycurl.ENCODING, b'gzip')

                c.setopt(c.HEADERFUNCTION, target.handle_header)
                c.setopt(pycurl.WRITEDATA, target)
                m.add_handle(c)

                if freelist:
                    target = self.pop_target()
                else:
                    target = None

            while True:
                ret, num_handles = m.perform()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break

            while True:
                num_q, ok_list, err_list = m.info_read()
                for c in ok_list:
                    target = c.target
                    m.remove_handle(c)
                    msg_verb = "deleted" if target.get_verb() == 'DELETE' else "got"
                    eff_url = c.getinfo(pycurl.EFFECTIVE_URL)
                    msg = msg_verb + " " + eff_url
                    if not target.succeeded():
                        if target.http_code is None:
                            msg += " with no HTTP status"
                        else:
                            msg += " with %d" % target.http_code

                    print(msg, file=sys.stderr)

                    target.close()
                    c.target = None
                    freelist.append(c)

                for c, errno, errmsg in err_list:
                    target = c.target
                    target.close()
                    c.target = None
                    m.remove_handle(c)
                    self.report_error(target, errno, errmsg)
                    freelist.append(c)

                num_processed += len(ok_list) + len(err_list)

                if self.healthcheck_interval and (self.total_processed >= self.total_checked + self.healthcheck_interval):
                    self.healthcheck()

                if num_q == 0:
                    break

            if num_started == num_processed:
                break

            m.select(1.0)

        for c in m.handles:
            if hasattr(c, 'target') and (c.target is not None):
                c.target.close()
                c.target = None

            c.close()

        m.close()
        return full or len(self.target_queue)
Example #26
0
def load_urls(collection, shape=(8, 256, 256), max_retries=MAX_RETRIES):
    sockets = set()
    socket_events = []
    timeout = 0

    def _fsocket(event, socket, multi, data):
        if event == pycurl.POLL_REMOVE:
            sockets.remove(socket)
        else:
            if socket not in sockets:
                sockets.add(socket)
        socket_events.append((event, multi))

    mc = pycurl.CurlMulti()
    mc.setopt(pycurl.M_SOCKETFUNCTION, _fsocket)
    #mc.setopt(pycurl.M_PIPELINING, True)
    nhandles = len(collection)
    results, cmap = {}, {}
    for url, token, index in collection:
        index = tuple(index)
        _curl, fp = _setup_curl(url, token, index)
        cmap[index] = (_curl, fp)
        mc.add_handle(_curl)

    while (pycurl.E_CALL_MULTI_PERFORM == mc.socket_all()[0]):
        pass

    timeout = mc.timeout()

    nprocessed = 0
    while timeout >= 0:
        (rr, wr, er) = select.select(sockets, sockets, sockets,
                                     timeout / 1000.0)
        socketSet = set(rr + wr + er)
        if socketSet:
            for s in socketSet:
                while True:
                    (ret, running) = mc.socket_all()
                    if ret != pycurl.E_CALL_MULTI_PERFORM:
                        break
                nq, suc, failed = mc.info_read()
                nprocessed += len(suc)
                for h in suc:
                    _fp = cmap[h.index][-1]
                    _fp.flush()
                    _fp.close()
                    try:
                        arr = imread(_fp.name)
                        if len(arr.shape) == 3:
                            arr = np.rollaxis(arr, 2, 0)
                        else:
                            arr = np.expand_dims(arr, axis=0)
                    except Exception as e:
                        print(e)
                        arr = np.zeros(shape, dtype=np.float32)
                    finally:
                        results[h.index] = arr
                        h.close()
                        mc.remove_handle(h)
                        os.remove(_fp.name)
                for h, err_num, err_msg in failed:
                    print('failed: {}, code={}, msg={}'.format(
                        h.index, err_num, err_msg))
                    _fp = cmap[h.index][-1]
                    _fp.flush()
                    _fp.close()
                    os.remove(_fp.name)
                    h.close()
                    mc.remove_handle(h)
                    _curl, fp = _setup_curl(h.url, h.token, h.index)
                    cmap[h.index] = (_curl, fp)
                    mc.add_handle(_curl)

        else:
            (ret, running) = mc.socket_action(pycurl.SOCKET_TIMEOUT, 0)
        if running == 0:
            break
    mc.close()
    return results
Example #27
0
    if not url or url[0] == "#":
        continue
    filename = "doc_%03d.dat" % (len(queue) + 1)
    queue.append((url, filename))

# Check args
assert queue, "no URLs given"
num_urls = len(queue)
num_conn = min(num_conn, num_urls)
assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
print("PycURL %s (compiled against 0x%x)" %
      (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM))
print("----- Getting", num_urls, "URLs using", num_conn, "connections -----")

# Pre-allocate a list of curl objects
m = pycurl.CurlMulti()
m.handles = []
for i in range(num_conn):
    c = pycurl.Curl()
    c.fp = None
    c.setopt(pycurl.FOLLOWLOCATION, 1)
    c.setopt(pycurl.MAXREDIRS, 5)
    c.setopt(pycurl.CONNECTTIMEOUT, 30)
    c.setopt(pycurl.TIMEOUT, 300)
    c.setopt(pycurl.NOSIGNAL, 1)
    m.handles.append(c)

# Main loop
freelist = m.handles[:]
num_processed = 0
while num_processed < num_urls:
Example #28
0
class DahuaEventThread(threading.Thread):
    """Connects to device and subscribes to events"""
    Devices = []
    NumActivePlayers = 0

    CurlMultiObj = pycurl.CurlMulti()
    NumCurlObjs = 0

    def __init__(self, hass, config):
        """Construct a thread listening for events."""
        self.hass = hass

        for device_cfg in config:
            url = URL_TEMPLATE.format(protocol=device_cfg.get("protocol"),
                                      host=device_cfg.get("host"),
                                      port=device_cfg.get("port"),
                                      events=device_cfg.get("events"))
            channels = device_cfg.get("channels")
            channels_dict = {}
            if channels is not None:
                for channel in channels:
                    channels_dict[channel.get("number")] = channel.get("name")

            device = DahuaDevice(self, hass, device_cfg.get("name"), url,
                                 channels_dict)
            self.Devices.append(device)

            CurlObj = pycurl.Curl()
            device.CurlObj = CurlObj

            CurlObj.setopt(pycurl.URL, url)
            CurlObj.setopt(pycurl.CONNECTTIMEOUT, 30)
            CurlObj.setopt(pycurl.TCP_KEEPALIVE, 1)
            CurlObj.setopt(pycurl.TCP_KEEPIDLE, 30)
            CurlObj.setopt(pycurl.TCP_KEEPINTVL, 15)
            CurlObj.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_DIGEST)
            CurlObj.setopt(
                pycurl.USERPWD,
                "%s:%s" % (device_cfg.get("user"), device_cfg.get("password")))
            CurlObj.setopt(pycurl.WRITEFUNCTION, device.OnReceive)

            self.CurlMultiObj.add_handle(CurlObj)
            self.NumCurlObjs += 1

            _LOGGER.debug("Added Dahua device at: %s", url)

        threading.Thread.__init__(self)
        self.stopped = threading.Event()

    def run(self):
        """Fetch events"""
        while 1:
            Ret, NumHandles = self.CurlMultiObj.perform()
            if Ret != pycurl.E_CALL_MULTI_PERFORM:
                break

        Ret = self.CurlMultiObj.select(1.0)
        while not self.stopped.isSet():
            # Sleeps to ease load on processor
            time.sleep(.05)
            Ret, NumHandles = self.CurlMultiObj.perform()

            if NumHandles != self.NumCurlObjs:
                _, Success, Error = self.CurlMultiObj.info_read()

                for CurlObj in Success:
                    DahuaDevice = next(
                        filter(lambda x: x.CurlObj == CurlObj, self.Devices))
                    if DahuaDevice.Reconnect:
                        continue

                    DahuaDevice.OnDisconnect("Success")
                    DahuaDevice.Reconnect = time.time() + 5

                for CurlObj, ErrorNo, ErrorStr in Error:
                    DahuaDevice = next(
                        filter(lambda x: x.CurlObj == CurlObj, self.Devices))
                    if DahuaDevice.Reconnect:
                        continue

                    DahuaDevice.OnDisconnect("{0} ({1})".format(
                        ErrorStr, ErrorNo))
                    DahuaDevice.Reconnect = time.time() + 5

                for DahuaDevice in self.Devices:
                    if DahuaDevice.Reconnect and DahuaDevice.Reconnect < time.time(
                    ):
                        self.CurlMultiObj.remove_handle(DahuaDevice.CurlObj)
                        self.CurlMultiObj.add_handle(DahuaDevice.CurlObj)
                        DahuaDevice.Reconnect = None
Example #29
0
def getdata(urls, ckey, cert, headers=None, options=None, num_conn=50, cookie=None):
    """
    Get data for given list of urls, using provided number of connections
    and user credentials
    """

    if not options:
        options = pycurl_options()

    # Make a queue with urls
    queue = [u for u in urls if validate_url(u)]

    # Check args
    num_urls = len(queue)
    num_conn = min(num_conn, num_urls)

    # Pre-allocate a list of curl objects
    mcurl = pycurl.CurlMulti()
    mcurl.handles = []
    for _ in range(num_conn):
        curl = pycurl.Curl()
        curl.fp = None
        for key, val in viewitems(options):
            curl.setopt(getattr(pycurl, key), val)
        curl.setopt(pycurl.SSLKEY, ckey)
        curl.setopt(pycurl.SSLCERT, cert)
        mcurl.handles.append(curl)
        if headers:
            curl.setopt(pycurl.HTTPHEADER, \
                        ["%s: %s" % (k, v) for k, v in viewitems(headers)])

    # Main loop
    freelist = mcurl.handles[:]
    num_processed = 0
    while num_processed < num_urls:
        # If there is an url to process and a free curl object,
        # add to multi-stack
        while queue and freelist:
            url = queue.pop(0)
            curl = freelist.pop()
            curl.setopt(pycurl.URL, url.encode('ascii', 'ignore'))
            if cookie and url in cookie:
                curl.setopt(pycurl.COOKIEFILE, cookie[url])
                curl.setopt(pycurl.COOKIEJAR, cookie[url])
            bbuf = BytesIO()
            hbuf = BytesIO()
            curl.setopt(pycurl.WRITEFUNCTION, bbuf.write)
            curl.setopt(pycurl.HEADERFUNCTION, hbuf.write)
            mcurl.add_handle(curl)
            # store some info
            curl.hbuf = hbuf
            curl.bbuf = bbuf
            curl.url = url
        # Run the internal curl state machine for the multi stack
        while True:
            ret, _ = mcurl.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        # Check for curl objects which have terminated, and add them to the
        # freelist
        while True:
            num_q, ok_list, err_list = mcurl.info_read()
            for curl in ok_list:
                if sys.version.startswith('3.'):
                    hdrs = curl.hbuf.getvalue().decode('utf-8')
                    data = curl.bbuf.getvalue().decode('utf-8')
                else:
                    hdrs = curl.hbuf.getvalue()
                    data = curl.bbuf.getvalue()
                url = curl.url
                curl.bbuf.flush()
                curl.bbuf.close()
                curl.hbuf.close()
                curl.hbuf = None
                curl.bbuf = None
                mcurl.remove_handle(curl)
                freelist.append(curl)
                yield {'url': url, 'data': data, 'headers': hdrs}
            for curl, errno, errmsg in err_list:
                hdrs = curl.hbuf.getvalue()
                data = curl.bbuf.getvalue()
                url = curl.url
                curl.bbuf.flush()
                curl.bbuf.close()
                curl.hbuf.close()
                curl.hbuf = None
                curl.bbuf = None
                mcurl.remove_handle(curl)
                freelist.append(curl)
                yield {'url': url, 'data': None, 'headers': hdrs, \
                       'error': errmsg, 'code': errno}
            num_processed = num_processed + len(ok_list) + len(err_list)
            if num_q == 0:
                break
        # Currently no more I/O is pending, could do something in the meantime
        # (display a progress bar, etc.).
        # We just call select() to sleep until some more data is available.
        mcurl.select(1.0)

    cleanup(mcurl)
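A hedged usage sketch for getdata above; the URLs, key/cert paths, and header
values are placeholders, and the URLs are assumed to pass the project's
validate_url helper. getdata is a generator, so results are consumed lazily as
transfers finish:

urls = ['https://example.com/api/a', 'https://example.com/api/b']
for row in getdata(urls,
                   ckey='/path/to/userkey.pem',
                   cert='/path/to/usercert.pem',
                   headers={'Accept': 'application/json'},
                   num_conn=10):
    if row.get('error'):
        print('failed %s: %s' % (row['url'], row['error']))
    else:
        print('got %d bytes from %s' % (len(row['data']), row['url']))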
Example #30
0
 def test_multi_close_twice(self):
     m = pycurl.CurlMulti()
     m.close()
     m.close()