def test_positional_arguments(self):
    pycurl.CurlMulti(1)
def test_remove_invalid_closed_handle(self):
    m = pycurl.CurlMulti()
    c = util.DefaultCurl()
    c.close()
    m.remove_handle(c)
    del m, c
def fetch(self, req_url, output=None):
    """
    Fetch a file.

    Parameters
    ----------
    req_url : str
        URL of the file to retrieve
    output : str, optional
        filename, possibly with path, of the downloaded file.

    TODO: test can_segment == false
    """
    (eurl, size, can_segment) = _check_headers(req_url)
    if output is None:
        output = os.path.split(eurl)[1]
        if len(output) < 1:
            raise RuntimeError("Output file must be provided if URL points "
                               "to a directory.")
    LOG.info('Downloading %s, (%d bytes)' % (output, size))
    segments = self._get_segments(size, can_segment)

    # allocate file space
    afile = open(output, 'wb')
    if size > 0:
        afile.truncate(size)
    afile.close()
    out_file = open(output, 'r+b')

    connections = []
    for i in range(len(segments)):
        c = Connection(eurl, can_segment)
        connections.append(c)
    con = {
        'connections': connections,
        'free': connections[:],
        'working': []
    }

    start_time = time.time()
    elapsed = None
    mcurl = pycurl.CurlMulti()
    while True:
        while segments and con['free']:
            p = segments.pop(0)
            c = con['free'].pop(0)
            c.prepare(out_file, p)
            con['working'].append(c)
            mcurl.add_handle(c.curl)
            LOG.debug('%s:Start downloading', c.name)
        while True:
            ret, handles_num = mcurl.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        while True:
            num_q, ok_list, err_list = mcurl.info_read()
            for curl in ok_list:
                curl.errno = pycurl.E_OK
                mcurl.remove_handle(curl)
                c = curl.connection
                con['working'].remove(c)
                c.errno = curl.errno
                c.errmsg = None
                c.code = curl.getinfo(pycurl.RESPONSE_CODE)
                if c.code in STATUS_OK:
                    LOG.info('%s: Download succeeded. (%d/%d)', c.name,
                             c.segment_downloaded, c.segment_size)
                    con['free'].append(c)
                elif c.code in STATUS_ERROR:
                    msg = '%s:Error < %d >! Connection will be closed'
                    LOG.error(msg, c.name, c.code)
                    con['connections'].remove(c)
                    c.close()
                    segments.append(c.segment)
                    new_c = Connection(c.getopt(pycurl.URL))
                    con['connections'].append(new_c)
                    con['free'].append(new_c)
                else:
                    msg = '%s: Unhandled http status code %d'
                    raise Exception(msg % (c.name, c.code))
            for curl, errno, errmsg in err_list:
                curl.errno = errno
                curl.errmsg = errmsg
                mcurl.remove_handle(curl)
                c = curl.connection
                c.errno = curl.errno
                c.errmsg = curl.errmsg
                con['working'].remove(c)
                msg = '%s:Download failed < %s >'
                LOG.error(msg, c.name, c.errmsg)
                if c.can_segment and c.retried < self.max_retry:
                    c.prepare_retry()
                    con['working'].append(c)
                    mcurl.add_handle(c.curl)
                    LOG.error('%s:Try again', c.name)
                else:
                    raise RuntimeError(c.errmsg)
            if num_q == 0:
                break
        elapsed = time.time() - start_time
        downloaded = sum(
            [connection.total_downloaded for connection in connections])
        _show_progress(size, downloaded, elapsed)
        if not con['working']:
            break
        mcurl.select(1.0)
    msg = 'Download Succeeded! Total Elapsed %ds' % elapsed
    LOG.info(msg)
def fireUp(target_list, num_conn, proxy_addr, proxy_port):
    tuples = target_list
    # Ignore SIGPIPE when using pycurl.NOSIGNAL.
    try:
        import signal
        from signal import SIGPIPE, SIG_IGN
        signal.signal(signal.SIGPIPE, signal.SIG_IGN)
    except ImportError:
        pass
    queue = []
    for tuple in tuples:
        for url in tuple:
            url = str(url).strip()
            if not url or url[0] == "#":
                continue
            cb = callback()
            queue.append((url, cb))
    num_urls = len(queue)
    num_conn = min(num_conn, num_urls)
    assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
    m = pycurl.CurlMulti()
    m.handles = []
    for i in range(num_conn):
        c = pycurl.Curl()
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        c.setopt(pycurl.MAXREDIRS, 5)
        c.setopt(pycurl.CONNECTTIMEOUT, 30)
        c.setopt(pycurl.TIMEOUT, 300)
        c.setopt(pycurl.NOSIGNAL, 1)
        c.setopt(
            pycurl.USERAGENT,
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; "
            ".NET CLR 1.1.4322; .NET CLR 2.0.50727)")
        c.setopt(pycurl.HTTPHEADER, [
            "Accept: text/xml,application/xml,application/xhtml+xml,"
            "text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
            "Accept-Language: en-us,en;q=0.5",
            "Accept-Encoding: gzip,deflate",
            "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7",
            "Keep-Alive: 300",
            "Connection: keep-alive"
        ])
        if proxy_addr != '':
            c.setopt(pycurl.PROXY, proxy_addr)
            c.setopt(pycurl.PROXYPORT, proxy_port)
        m.handles.append(c)
    freelist = m.handles[:]
    num_processed = 0
    while num_processed < num_urls:
        while queue and freelist:
            url, cb = queue.pop(0)
            c = freelist.pop()
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.WRITEFUNCTION, cb.feed)
            m.add_handle(c)
            c.url = url
        while 1:
            ret, num_handles = m.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        while 1:
            num_q, ok_list, err_list = m.info_read()
            for c in ok_list:
                m.remove_handle(c)
                freelist.append(c)
            for c, errno, errmsg in err_list:
                m.remove_handle(c)
                freelist.append(c)
            num_processed = num_processed + len(ok_list) + len(err_list)
            if num_q == 0:
                break
        m.select(1.0)
    for c in m.handles:
        c.close()
    m.close()
def crawl(urls, sbook, fbook, num_conn=500):
    success_count = 0
    failure_count = 0
    start_time = time.time()
    import sys
    import pycurl
    # We should ignore SIGPIPE when using pycurl.NOSIGNAL
    try:
        import signal
        from signal import SIGPIPE, SIG_IGN
        signal.signal(signal.SIGPIPE, signal.SIG_IGN)
    except ImportError:
        pass
    # Make a queue with (url, filename) tuples
    queue = []
    for url in urls:
        url = url.strip()
        if not url or url[0] == "#":
            continue
        filename = str(md5.new(url).hexdigest()) + ".uss"
        queue.append((url, filename))
    # Check args
    assert queue, "no URLs given"
    num_urls = len(queue)
    num_conn = min(num_conn, num_urls)
    assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
    print "I got ", num_urls, " URLs to process..."
    # Pre-allocate a list of curl objects
    m = pycurl.CurlMulti()
    m.handles = []
    for i in range(num_conn):
        c = pycurl.Curl()
        c.fp = None
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        c.setopt(pycurl.MAXREDIRS, 3)
        c.setopt(pycurl.CONNECTTIMEOUT, 60)
        c.setopt(pycurl.TIMEOUT, 300)
        c.setopt(pycurl.LOW_SPEED_LIMIT, 0)
        c.setopt(pycurl.LOW_SPEED_TIME, 0)
        c.setopt(pycurl.NOSIGNAL, 1)
        m.handles.append(c)
    # Main loop
    freelist = m.handles[:]
    num_processed = 0
    while num_processed < num_urls:
        # If there is an url to process and a free curl object, add to multi stack
        while queue and freelist:
            url, filename = queue.pop(0)
            c = freelist.pop()
            c.fp = open(filename, "wb")
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.WRITEDATA, c.fp)
            m.add_handle(c)
            # store some info
            c.filename = filename
            c.url = url
        # Run the internal curl state machine for the multi stack
        while 1:
            ret, num_handles = m.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        # Check for curl objects which have terminated, and add them to the freelist
        while 1:
            num_q, ok_list, err_list = m.info_read()
            for c in ok_list:
                c.fp.close()
                c.fp = None
                m.remove_handle(c)
                success_count += 1
                pattern = "-->" + str(c.filename) + ":::" + str(c.url) + ":::" + str(
                    c.getinfo(pycurl.EFFECTIVE_URL)) + chr(10)
                sbook.write(pattern)
                sbook.flush()
                freelist.append(c)
            for c, errno, errmsg in err_list:
                c.fp.close()
                c.fp = None
                m.remove_handle(c)
                failure_count += 1
                pattern = "-->" + str(c.filename) + ":::" + str(c.url) + ":::" + str(
                    errno) + ":::" + str(errmsg) + chr(10)
                fbook.write(pattern)
                fbook.flush()
                freelist.append(c)
            num_processed = num_processed + len(ok_list) + len(err_list)
            if num_q == 0:
                break
        msg = "Total Processed:" + str(num_processed) + ", Ok:" + str(
            success_count) + ", Not Ok:" + str(failure_count) + ", Time:" + str(
            time.time() - start_time)
        #sys.stdout.write("\r"+str(msg))
        #sys.stdout.flush()
        # Currently no more I/O is pending, could do something in the meantime
        # (display a progress bar, etc.).
        # We just call select() to sleep until some more data is available.
        m.select(1.0)
    # Cleanup
    for c in m.handles:
        if c.fp is not None:
            c.fp.close()
            c.fp = None
        c.close()
    m.close()
    return num_processed, success_count, failure_count
def fetch(self): """ Download urls via multicurl. Get new tasks from queue. """ m = pycurl.CurlMulti() m.handles = [] # Create curl instances for x in xrange(self.thread_number): curl = pycurl.Curl() m.handles.append(curl) freelist = m.handles[:] # This is infinite cycle # You can break it only from outside code which # iterates over result of this method while True: cached_request = None while len(freelist): # Increase request counter if (self.request_limit is not None and self.counters['request'] >= self.request_limit): logging.debug('Request limit is reached: %s' %\ self.request_limit) if len(freelist) == self.thread_number: yield None else: break else: try: priority, task = self.taskq.get(True, 0.1) except Empty: # If All handlers are free and no tasks in queue # yield None signal if len(freelist) == self.thread_number: yield None else: break else: if not self._preprocess_task(task): continue task.network_try_count += 1 if task.task_try_count == 0: task.task_try_count = 1 if task.task_try_count > self.task_try_limit: logging.debug('Task tries ended: %s / %s' % (task.name, task.url)) self.add_item('too-many-task-tries', task.url) continue if task.network_try_count > self.network_try_limit: logging.debug('Network tries ended: %s / %s' % (task.name, task.url)) self.add_item('too-many-network-tries', task.url) continue #import pdb; pdb.set_trace() if task.grab: grab = task.grab else: # Set up curl instance via Grab interface grab = Grab(**self.grab_config) grab.setup(url=task.url) if self.use_cache and not task.get('disable_cache'): if grab.detect_request_method() == 'GET': url = grab.config['url'] cache_item = self.cache.find_one({'_id': url}) if cache_item: #if url in self.cache: #cache_item = pickle.loads(self.cache[url]) #logging.debug('From cache: %s' % url) cached_request = (grab, grab.clone(), task, cache_item) grab.prepare_request() self.inc_count('request-cache') # break from prepre-request cycle # and go to process-response code break self.inc_count('request-network') if self.proxylist_config: args, kwargs = self.proxylist_config grab.setup_proxylist(*args, **kwargs) curl = freelist.pop() curl.grab = grab curl.grab.curl = curl curl.grab_original = grab.clone() curl.grab.prepare_request() curl.task = task # Add configured curl instance to multi-curl processor m.add_handle(curl) # If there were done network requests if len(freelist) != self.thread_number: while True: status, active_objects = m.perform() if status != pycurl.E_CALL_MULTI_PERFORM: break if cached_request: grab, grab_original, task, cache_item = cached_request url = task.url # or grab.config['url'] grab.fake_response(cache_item['body']) def custom_prepare_response(g): g.response.head = cache_item['head'].encode('utf-8') g.response.body = cache_item['body'].encode('utf-8') g.response.code = cache_item['response_code'] g.response.time = 0 g.response.url = cache_item['url'] g.response.parse('utf-8') g.response.cookies = g.extract_cookies() grab.process_request_result(custom_prepare_response) yield { 'ok': True, 'grab': grab, 'grab_original': grab_original, 'task': task, 'ecode': None, 'emsg': None } self.inc_count('request') while True: queued_messages, ok_list, fail_list = m.info_read() results = [] for curl in ok_list: results.append((True, curl, None, None)) for curl, ecode, emsg in fail_list: results.append((False, curl, ecode, emsg)) for ok, curl, ecode, emsg in results: res = self.process_multicurl_response( ok, curl, ecode, emsg) m.remove_handle(curl) freelist.append(curl) yield res self.inc_count('request') if not 
queued_messages: break m.select(0.5)
def setUp(self):
    super(MultiOptionConstantsTest, self).setUp()
    self.m = pycurl.CurlMulti()
def __init__(self, url, method, data=None, kerberos_auth=False, allow_redirects=True, verify_ssl=True, ca=None, use_json=False, headers=None, stream=False, username=None, password=None, client_cert=None, client_key=None, verbose=False): self.finished = False # have we read all data? self.closed = False # have we destroyed curl resources? self.status_code = 0 self.headers = None self.response_buffer = BytesIO() self.headers_buffer = BytesIO() self.response_decoder = None self.url = url headers = headers or {} method = method.lower() self.c = pycurl.Curl() self.curl_multi = pycurl.CurlMulti() if method == 'post': self.c.setopt(pycurl.POST, 1) headers["Expect"] = "" # openshift can't handle Expect elif method == 'get': self.c.setopt(pycurl.HTTPGET, 1) elif method == 'put': # self.c.setopt(pycurl.PUT, 1) self.c.setopt(pycurl.CUSTOMREQUEST, b"PUT") headers["Expect"] = "" elif method == 'delete': self.c.setopt(pycurl.CUSTOMREQUEST, b"DELETE") else: raise RuntimeError("Unsupported method '%s' for curl call!" % method) self.c.setopt(pycurl.COOKIEFILE, b'') self.c.setopt(pycurl.URL, str(url)) self.c.setopt(pycurl.WRITEFUNCTION, self.response_buffer.write) self.c.setopt(pycurl.HEADERFUNCTION, self.headers_buffer.write) self.c.setopt(pycurl.DEBUGFUNCTION, self._curl_debug) self.c.setopt(pycurl.SSL_VERIFYPEER, 1 if verify_ssl else 0) self.c.setopt(pycurl.SSL_VERIFYHOST, 2 if verify_ssl else 0) if ca: logger.info("Setting CAINFO to %r", ca) self.c.setopt(pycurl.CAINFO, ca) self.c.setopt(pycurl.VERBOSE, 1 if verbose else 0) if username and password: username = username.encode('utf-8') password = password.encode('utf-8') self.c.setopt(pycurl.USERPWD, username + b":" + password) if client_cert and client_key: self.c.setopt(pycurl.SSLCERTTYPE, "PEM") self.c.setopt(pycurl.SSLKEYTYPE, "PEM") self.c.setopt(pycurl.SSLCERT, client_cert) self.c.setopt(pycurl.SSLKEY, client_key) if data: # curl sets the method to post if one sets any POSTFIELDS (even '') self.c.setopt(pycurl.POSTFIELDS, data) if use_json: headers['Content-Type'] = b'application/json' if allow_redirects: self.c.setopt(pycurl.FOLLOWLOCATION, 1) if kerberos_auth: self.c.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_GSSNEGOTIATE) self.c.setopt(pycurl.USERPWD, b':') if stream: headers['Cache-Control'] = b'no-cache' if headers: header_list = [] for header_key, header_value in headers.items(): header_list.append(str("%s: %s" % (header_key, header_value))) self.c.setopt(pycurl.HTTPHEADER, header_list) self.curl_multi.add_handle(self.c) # Send request and read all headers. We have all headers once we receive some data or once # the response ends. # NOTE: HTTP response in chunked encoding can contain additional headers ("trailers") in the # last chunk. This is not handled here. while not (self.finished or self._any_data_received()): self._select() self._perform() self.headers = parse_headers(self.headers_buffer.getvalue()) self.status_code = self.c.getinfo(pycurl.HTTP_CODE) self.response_decoder = codecs.getincrementaldecoder(self.encoding)()
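# The constructor above drives the transfer through three helpers that are not
# shown in this snippet (_select, _perform and _any_data_received). The sketch
# below is an assumption about what such method bodies plausibly look like,
# based only on the attribute names used above; it is not the module's actual
# implementation:
def _select(self):
    # Wait until libcurl reports activity on the underlying sockets,
    # but never block for more than one second.
    self.curl_multi.select(1.0)

def _perform(self):
    # Drive the transfer; CurlMulti.perform() must be called again
    # while it keeps returning E_CALL_MULTI_PERFORM.
    while True:
        ret, num_handles = self.curl_multi.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break
    # Once no handle is running any more, the response is complete.
    if num_handles == 0:
        self.finished = True

def _any_data_received(self):
    # Headers are complete once the first body bytes have been written
    # into the response buffer by the WRITEFUNCTION callback.
    return self.response_buffer.tell() > 0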
def test_multi_socket_select(self):
    sockets = set()
    timeout = 0
    urls = [
        'http://localhost:8380/success',
        'http://localhost:8381/success',
        'http://localhost:8382/success',
    ]
    socket_events = []

    # socket callback
    def socket(event, socket, multi, data):
        if event == pycurl.POLL_REMOVE:
            #print("Remove Socket %d"%socket)
            sockets.remove(socket)
        else:
            if socket not in sockets:
                #print("Add socket %d"%socket)
                sockets.add(socket)
        socket_events.append((event, multi))

    # init
    m = pycurl.CurlMulti()
    m.setopt(pycurl.M_PIPELINING, 1)
    m.setopt(pycurl.M_SOCKETFUNCTION, socket)
    m.handles = []
    for url in urls:
        c = pycurl.Curl()
        # save info in standard Python attributes
        c.url = url
        c.body = util.StringIO()
        c.http_code = -1
        m.handles.append(c)
        # pycurl API calls
        c.setopt(c.URL, c.url)
        c.setopt(c.WRITEFUNCTION, c.body.write)
        m.add_handle(c)

    # get data
    num_handles = len(m.handles)
    while pycurl.E_CALL_MULTI_PERFORM == m.socket_all()[0]:
        pass
    timeout = m.timeout()
    # timeout might be -1, indicating that all work is done
    # XXX make sure there is always work to be done here?
    while timeout >= 0:
        (rr, wr, er) = select.select(sockets, sockets, sockets, timeout / 1000.0)
        socketSet = set(rr + wr + er)
        if socketSet:
            for s in socketSet:
                while True:
                    (ret, running) = m.socket_action(s, 0)
                    if ret != pycurl.E_CALL_MULTI_PERFORM:
                        break
        else:
            (ret, running) = m.socket_action(pycurl.SOCKET_TIMEOUT, 0)
        if running == 0:
            break

    for c in m.handles:
        # save info in standard Python attributes
        c.http_code = c.getinfo(c.HTTP_CODE)

    # at least in and remove events per socket
    assert len(socket_events) >= 6, 'Less than 6 socket events: %s' % repr(socket_events)

    # print result
    for c in m.handles:
        self.assertEqual('success', c.body.getvalue())
        self.assertEqual(200, c.http_code)

    # multi, not curl handle
    self.check(pycurl.POLL_IN, m, socket_events)
    self.check(pycurl.POLL_REMOVE, m, socket_events)

    # close handles
    for c in m.handles:
        # pycurl API calls
        m.remove_handle(c)
        c.close()
    m.close()
def crawl(urls, sbook, fbook, num_conn=500):
    #! /usr/bin/env python
    # -*- coding: iso-8859-1 -*-
    # vi:ts=4:et
    # $Id: retriever-multi.py,v 1.29 2005/07/28 11:04:13 mfx Exp $
    #
    # Usage: python retriever-multi.py <file with URLs to fetch> [<# of
    # concurrent connections>]
    #
    import sys
    import pycurl
    # We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
    # the libcurl tutorial for more info.
    try:
        import signal
        from signal import SIGPIPE, SIG_IGN
        signal.signal(signal.SIGPIPE, signal.SIG_IGN)
    except ImportError:
        pass
    # Make a queue with (url, filename) tuples
    queue = []
    for url in urls:
        url = url.strip()
        if not url or url[0] == "#":
            continue
        filename = str(md5.new(url).hexdigest()) + ".uss"
        queue.append((url, filename))
    # Check args
    assert queue, "no URLs given"
    num_urls = len(queue)
    num_conn = min(num_conn, num_urls)
    assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
    print "PycURL %s (compiled against 0x%x)" % (
        pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM)
    print "----- Getting", num_urls, "URLs using", num_conn, "connections -----"
    # Pre-allocate a list of curl objects
    m = pycurl.CurlMulti()
    m.handles = []
    for i in range(num_conn):
        c = pycurl.Curl()
        c.fp = None
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        c.setopt(pycurl.MAXREDIRS, 3)
        c.setopt(pycurl.CONNECTTIMEOUT, 120)
        c.setopt(pycurl.TIMEOUT, 300)
        c.setopt(pycurl.NOSIGNAL, 1)
        m.handles.append(c)
    # Main loop
    freelist = m.handles[:]
    num_processed = 0
    while num_processed < num_urls:
        # If there is an url to process and a free curl object, add to multi stack
        while queue and freelist:
            url, filename = queue.pop(0)
            c = freelist.pop()
            c.fp = open(filename, "wb")
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.WRITEDATA, c.fp)
            m.add_handle(c)
            # store some info
            c.filename = filename
            c.url = url
        # Run the internal curl state machine for the multi stack
        while 1:
            ret, num_handles = m.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        # Check for curl objects which have terminated, and add them to the freelist
        while 1:
            num_q, ok_list, err_list = m.info_read()
            for c in ok_list:
                c.fp.close()
                c.fp = None
                m.remove_handle(c)
                print "Success:", c.filename, c.url, c.getinfo(pycurl.EFFECTIVE_URL)
                pattern = "-->" + str(c.filename) + ":::" + str(c.url) + ":::" + str(
                    c.getinfo(pycurl.EFFECTIVE_URL)) + chr(10)
                sbook.write(pattern)
                sbook.flush()
                freelist.append(c)
            for c, errno, errmsg in err_list:
                c.fp.close()
                c.fp = None
                m.remove_handle(c)
                print "Failed: ", c.filename, c.url, errno, errmsg
                pattern = "-->" + str(c.filename) + ":::" + str(c.url) + ":::" + str(
                    errno) + ":::" + str(errmsg) + chr(10)
                fbook.write(pattern)
                fbook.flush()
                freelist.append(c)
            num_processed = num_processed + len(ok_list) + len(err_list)
            if num_q == 0:
                break
        # Currently no more I/O is pending, could do something in the meantime
        # (display a progress bar, etc.).
        # We just call select() to sleep until some more data is available.
        m.select(1.0)
    # Cleanup
    for c in m.handles:
        if c.fp is not None:
            c.fp.close()
            c.fp = None
        c.close()
    m.close()
def curlcrawl(urls, num_conn=1, maxlink=100, dumpdir=None, mode=0750): """ crawl a list of sites. OA this function contains urlmap dict which we keeps on growing. Ideally there should be very limited amount of urls. This method cause memory to bloat. Need to improve upon this. """ totalfetched = 0 try: import signal from signal import SIGPIPE, SIG_IGN signal.signal(signal.SIGPIPE, signal.SIG_IGN) except ImportError: pass queue = [] # urlmap will keep on growing so we need to call this functon with better strcuture urlmap = dict() linkcounts = dict() globalbuffer = dict() for url in urls: url = url.strip() if not url: continue queue.append(url) linkcounts[urlparse.urlparse(url).netloc] = 0 urlmap[url] = urlparse.urlparse(url).netloc # print queue num_urls = len(queue) num_conn = min(num_conn, num_urls) assert 1 <= num_conn <= 10000, "invalid number of concurrent connections" m = pycurl.CurlMulti() m.handles = [] for i in range(num_conn): c = pycurl.Curl() c.fp = None c.setopt(pycurl.FOLLOWLOCATION, 1) c.setopt(pycurl.MAXREDIRS, 5) c.setopt(pycurl.CONNECTTIMEOUT, 30) c.setopt(pycurl.TIMEOUT, 300) c.setopt(pycurl.NOSIGNAL, 1) m.handles.append(c) freelist = m.handles[:] num_processed = 0 while num_processed < num_urls: # If there is an url to process and a free curl object, add to multi stack while queue and freelist: url = queue.pop(0) c = freelist.pop() # globalbuffer[url] = StringIO.StringIO() globalbuffer[url] = urlbuffer(dumpdir, url, mode) c.fp = globalbuffer[url].write # print url c.setopt(pycurl.URL, url.encode('utf-8')) # in following use WRITE_FUNC to write the data STORE use from StringIO import StringIO c.setopt(pycurl.WRITEFUNCTION, c.fp) # print 'adding url ', ' ', url m.add_handle(c) # store some info c.url = url # Run the internal curl state machine for the multi stack while 1: ret, num_handles = m.perform() if ret != pycurl.E_CALL_MULTI_PERFORM: break # Check for curl objects which have terminated, and add them to the freelist while 1: # numq is the number of messages still queued num_q, ok_list, err_list = m.info_read() for c in ok_list: c.fp = None m.remove_handle(c) eurl = c.getinfo(pycurl.EFFECTIVE_URL) parent = urlmap[c.url] freelist.append(c) if linkcounts[parent] > maxlink: htmlc = None links = getlinks(globalbuffer[c.url].getvalue(), eurl, parent) # print 'gt links' + len(links) # if dumpdir is not None: for link in links: if linkcounts[parent] <= maxlink and urlmap.get( link) is None: queue.append(link) urlmap[link] = parent num_urls = num_urls + 1 linkcounts[parent] = linkcounts[parent] + 1 totalfetched = totalfetched + 1 # print 'fetched ', ' ', eurl for c, errno, errmsg in err_list: c.fp = None m.remove_handle(c) # print "Failed: ", c.url, errno, errmsg freelist.append(c) num_processed = num_processed + len(ok_list) + len(err_list) if num_q == 0: break # Currently no more I/O is pending, could do something in the meantime # (display a progress bar, etc.). # We just call select() to sleep until some more data is available. m.select(1.0) for c in m.handles: if c.fp is not None: c.fp = None c.close() m.close() return totalfetched
def download( concurrent_connections, iterator, save_result, max_redirects=DEFAULT_MAX_REDIRECTS, connect_timeout_seconds=DEFAULT_CONNECT_TIMEOUT_SECONDS, timeout_seconds=DEFAULT_TIMEOUT_SECONDS, ): # We should ignore SIGPIPE when using pycurl.NOSIGNAL - see # the libcurl tutorial for more info. signal.signal(SIGPIPE, SIG_IGN) curl_multi = pycurl.CurlMulti() curl_multi.handles = [] for i in range(concurrent_connections): curl = pycurl.Curl() curl.fp = None curl.setopt(pycurl.FOLLOWLOCATION, 1) curl.setopt(pycurl.MAXREDIRS, max_redirects) curl.setopt(pycurl.CONNECTTIMEOUT, connect_timeout_seconds) curl.setopt(pycurl.TIMEOUT, int(timeout_seconds)) curl.setopt(pycurl.NOSIGNAL, 1) curl_multi.handles.append(curl) try: freelist = curl_multi.handles[:] while True: while len(freelist) > 0: urlobj = next(iterator) if urlobj is None: break else: curl = freelist.pop() curl.setopt(pycurl.URL, urlobj['url']) curl.fp = open(urlobj['output_filename'], "wb") curl.hfp = open( urlobj['header_filename'], "wb") if urlobj.get( 'header_filename') is not None else None curl.setopt(pycurl.WRITEDATA, curl.fp) if curl.hfp is not None: curl.setopt(pycurl.WRITEHEADER, curl.hfp) curl_multi.add_handle(curl) curl.urlobj = urlobj if len(freelist) == concurrent_connections: time.sleep(SLEEP_TIME_SECONDS_IF_NONE_RUNNING) else: while True: ret, num_running_handles = curl_multi.perform() if ret != pycurl.E_CALL_MULTI_PERFORM: break while True: num_handles_in_queue, ok_list, err_list = curl_multi.info_read( ) for curl in ok_list: curl.fp.close() curl.fp = None if curl.hfp is not None: curl.hfp.close() curl.hfp = None curl_multi.remove_handle(curl) save_result(curl.urlobj, response_code=curl.getinfo( pycurl.RESPONSE_CODE)) curl.urlobj = None freelist.append(curl) for curl, errno, errmsg in err_list: curl.fp.close() curl.fp = None if curl.hfp is not None: curl.hfp.close() curl.hfp = None curl_multi.remove_handle(curl) save_result(curl.urlobj, errno=errno, errmsg=errmsg) curl.urlobj = None freelist.append(curl) if num_handles_in_queue == 0: break curl_multi.select(1.0) finally: for curl in curl_multi.handles: if getattr(curl, 'fp', None) is not None: curl.fp.close() curl.fp = None if getattr(curl, 'hfp', None) is not None: curl.hfp.close() curl.hfp = None curl.urlobj = None curl.close() curl_multi.close()
def muti_curl():
    # maximum number of connections
    num_conn = 20
    queue = []
    # urls = ['http://www.baidu.com/'] * 10
    urls = ['http://127.0.0.1:8082/'] * 10
    for url in urls:
        queue.append(url)
    num_urls = len(queue)
    num_conn = min(num_conn, num_urls)
    print('----- Getting', num_urls, 'Max conn', num_conn, 'connections -----')
    m = pycurl.CurlMulti()
    # pre-allocate the handles; they can be reused
    m.handles = []
    for i in range(num_conn):
        c = pycurl.Curl()
        # c.body = StringIO()
        c.body = BytesIO()
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        c.setopt(pycurl.MAXREDIRS, 5)
        c.setopt(pycurl.CONNECTTIMEOUT, 30)
        c.setopt(pycurl.TIMEOUT, 300)
        c.setopt(pycurl.NOSIGNAL, 1)
        m.handles.append(c)
    freelist = m.handles[:]
    num_processed = 0
    # start of the main loop
    while num_processed < num_urls:
        # add request URLs
        while queue and freelist:
            url = queue.pop()
            c = freelist.pop()
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.WRITEFUNCTION, c.body.write)
            m.add_handle(c)
            c.url = url
            # print url
        # perform the requests
        while 1:
            (ret, num_handles) = m.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        # block for a while until some connections complete
        m.select(1.0)
        # read out the completed connections
        while 1:
            (num_q, ok_list, err_list) = m.info_read()
            for c in ok_list:
                m.remove_handle(c)
                # print c.body.getvalue()
                freelist.append(c)
            for (c, errno, errmsg) in err_list:
                m.remove_handle(c)
                print('Failed: ', c.url, errno, errmsg)
                freelist.append(c)
            num_processed = num_processed + len(ok_list) + len(err_list)
            if num_q == 0:
                break
    for c in m.handles:
        c.fp = None
        c.close()
    m.close()
def test_keyword_arguments(self):
    pycurl.CurlMulti(a=1)
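# The constructor-argument tests above (test_positional_arguments and
# test_keyword_arguments) rely on pycurl.CurlMulti() taking no arguments.
# A minimal self-contained sketch of the same checks, assuming a plain
# unittest.TestCase instead of the suite's own base class and helpers:
import unittest
import pycurl

class CurlMultiConstructorSketch(unittest.TestCase):
    def test_rejects_positional_arguments(self):
        # CurlMulti accepts no constructor arguments; extra ones raise TypeError.
        with self.assertRaises(TypeError):
            pycurl.CurlMulti(1)

    def test_rejects_keyword_arguments(self):
        with self.assertRaises(TypeError):
            pycurl.CurlMulti(a=1)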
def __init__(self, in_max_requests=10, in_options={}):
    self.max_requests = in_max_requests
    self.options = in_options
    self.outstanding_requests = {}
    self.multi_handle = pycurl.CurlMulti()
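# Only the constructor of the request pool is shown above. Below is a minimal
# self-contained sketch of how such a pool might be driven; the class and
# method names (SimpleCurlPool, fetch_all) are illustrative, not part of the
# original code:
import pycurl
from io import BytesIO

class SimpleCurlPool(object):
    def __init__(self, max_requests=10, options=None):
        self.max_requests = max_requests
        self.options = options or {}
        self.outstanding_requests = {}
        self.multi_handle = pycurl.CurlMulti()

    def fetch_all(self, urls):
        """Download urls (at most max_requests at a time) and return bodies."""
        results = {}
        pending = list(urls)
        while pending or self.outstanding_requests:
            # Top up the multi handle until the concurrency limit is reached.
            while pending and len(self.outstanding_requests) < self.max_requests:
                url = pending.pop(0)
                handle = pycurl.Curl()
                body = BytesIO()
                handle.setopt(pycurl.URL, url)
                handle.setopt(pycurl.WRITEFUNCTION, body.write)
                for option, value in self.options.items():
                    handle.setopt(option, value)
                self.outstanding_requests[handle] = (url, body)
                self.multi_handle.add_handle(handle)
            # Drive transfers, then collect whatever finished.
            while True:
                ret, _ = self.multi_handle.perform()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break
            while True:
                num_q, ok_list, err_list = self.multi_handle.info_read()
                for handle in ok_list:
                    url, body = self.outstanding_requests.pop(handle)
                    results[url] = body.getvalue()
                    self.multi_handle.remove_handle(handle)
                    handle.close()
                for handle, errno, errmsg in err_list:
                    url, _ = self.outstanding_requests.pop(handle)
                    results[url] = None
                    self.multi_handle.remove_handle(handle)
                    handle.close()
                if num_q == 0:
                    break
            if self.outstanding_requests:
                self.multi_handle.select(1.0)
        return results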
def download(url, target_file=None): "Download a file in parts, displaying progress. This function returns when the download is complete." sys.stdout.write("\033[s") manager = pycurl.CurlMulti() state, state_shelf_file_name = get_state_shelf(url, target_file) if state and "file_size" in state: # Continuation of earlier run state["parts"] = {} state["new_handles"] = [] state["manager"] = manager state["fd"] = open(state["target_file"], "rb+") download_loop_handle_info_read(state) else: state.update({ "parts": {}, "done": IntervalSet(), "url": url, "manager": manager, "new_handles": [], "target_file": target_file }) gen_curl(state, "0-") # cURL main loop: # Handle cURL events and add new handles as they become available (they are # spawned above in the progress callback) while True: while state["new_handles"]: manager.add_handle(state["new_handles"].pop(0)) ret, num_handles = manager.perform() download_loop_handle_info_read(state) if ret != pycurl.E_CALL_MULTI_PERFORM: break while num_handles or state["new_handles"]: update_progress(state) while state["new_handles"]: manager.add_handle(state["new_handles"].pop(0)) if manager.select(1) == -1: continue while True: while state["new_handles"]: manager.add_handle(state["new_handles"].pop(0)) ret, num_handles = manager.perform() download_loop_handle_info_read(state) if ret != pycurl.E_CALL_MULTI_PERFORM: break # Write remaining data for download in state["parts"]: flush_buffer(state, download) update_progress(state) if "fd" in state: state["fd"].close() state.sync() fail_state = False if "canceled" in state: # KeyboardInterrupt -- do not delete intermediates in this case. fail_state = True else: # Check for errors if "file_size" in state and state["file_size"]: covered = IntervalSet() covered.add((0, state["file_size"] - 1)) left_over = covered - state["done"] if left_over.contained: print( "\nSome parts of the file failed to download, namely bytes %s" % left_over) fail_state = True for download, dl_state in list(state["parts"].items()): if "cancel_status" in dl_state and dl_state[ "cancel_status"] == "deliberate": continue errstr = download.errstr() if errstr: if not fail_state: print() print(errstr) fail_state = True if fail_state: bytes_written = sum([b - a + 1 for a, b in state["done"].contained]) if not bytes_written: os.unlink(state["target_file"]) del state os.unlink(state_shelf_file_name) else: error_output("Download finished.") del state os.unlink(state_shelf_file_name) return fail_state
def download(queue_type, queue_directory, output_directory, concurrent_connections, max_downloads=None): if queue_type == 'timedout': timeout_seconds = MAX_TIMEOUT_SECONDS else: timeout_seconds = MIN_TIMEOUT_SECONDS # We should ignore SIGPIPE when using pycurl.NOSIGNAL - see # the libcurl tutorial for more info. signal.signal(SIGPIPE, SIG_IGN) m = pycurl.CurlMulti() m.handles = [] for i in range(int(concurrent_connections)): c = pycurl.Curl() c.fp = None c.setopt(pycurl.FOLLOWLOCATION, 1) c.setopt(pycurl.MAXREDIRS, DOWNLOAD_MAX_REDIRECTS) c.setopt(pycurl.CONNECTTIMEOUT, DOWNLOAD_CONNECT_TIMEOUT) c.setopt(pycurl.TIMEOUT, int(timeout_seconds)) c.setopt(pycurl.NOSIGNAL, 1) m.handles.append(c) start_time = datetime.datetime.now() try: while True: num_processed = 0 reached_max_downloads = False total_read_lines = 0 domains_last_start_times = {} skipped_due_to_domain_start_time = 0 total_stats = { 'num_existing_hash_id': 0, 'num_new_hash_id': 0, 'num_error_urls': 0, 'num_timeout_urls': 0 } freelist = m.handles[:] eof = False downloaded_url_ids = set() if os.path.exists(os.path.join(queue_directory, 'output.txt')): with open(os.path.join(queue_directory, 'output.txt')) as f: for line in f: downloaded_url_ids.add(int(line.strip())) with open(os.path.join(queue_directory, 'output.txt'), 'a') as output_file: with tempfile.TemporaryDirectory() as tmpdir: def save_result(url, url_id, errno=None, errmsg=None, response_code=None): is_timeout = errno == pycurl.E_OPERATION_TIMEDOUT now = datetime.datetime.now() url_relative_output_dir = os.path.join( str(now.year), str(now.month), str(now.day), str(now.hour), str(now.minute), str(url_id)) url_output_dir = os.path.join(output_directory, url_relative_output_dir) url_relative_output_filename = os.path.join( url_relative_output_dir, "output") output_filename = os.path.join(tmpdir, str(url_id), "output") header_filename = os.path.join(tmpdir, str(url_id), "header") hash_id = None if errno is None and response_code == 200: filesize = os.path.getsize(output_filename) if filesize > 0: hasher = hashlib.sha256() with open(output_filename, 'rb') as f: buf = f.read(HASH_BLOCKSIZE) while len(buf) > 0: hasher.update(buf) buf = f.read(HASH_BLOCKSIZE) hash = hasher.hexdigest() try: db.execute( "insert into hash (hash, size_bytes, download_path, downloaded_at) values (%s, %s, %s, %s)", (hash, filesize, url_relative_output_filename, datetime.datetime.now().strftime( DATETIME_FORMAT))) total_stats['num_new_hash_id'] += 1 os.makedirs(url_output_dir) os.rename( output_filename, os.path.join( output_directory, url_relative_output_filename)) except db.UniqueViolation: total_stats['num_existing_hash_id'] += 1 os.unlink(output_filename) hash_id = db.only_one( "select id from hash where hash=%s and size_bytes=%s", (hash, filesize))['id'] else: if is_timeout: total_stats['num_timeout_urls'] += 1 else: total_stats['num_error_urls'] += 1 os.unlink(output_filename) os.unlink(header_filename) os.rmdir(os.path.join(tmpdir, str(url_id))) url_update_history_id = db.only_one( "insert into url_update_history (url_id, updated_at, hash_id, error, error_code, timedout_seconds) values (%s, %s, %s, %s, %s, %s) RETURNING id", (url_id, datetime.datetime.now().strftime(DATETIME_FORMAT), hash_id, errmsg, errno or response_code, timeout_seconds if is_timeout else None))['id'] try: db.execute( "insert into url_last_update (url_id, url_update_history_id) values (%s, %s)", (url_id, url_update_history_id)) except db.UniqueViolation: db.execute( "update url_last_update set url_update_history_id=%s where 
url_id=%s", (url_update_history_id, url_id)) if hash_id: try: db.execute( "insert into url_last_successful_update (url_id, url_update_history_id) values (%s, %s)", (url_id, url_update_history_id)) except db.UniqueViolation: db.execute( "update url_last_successful_update set url_update_history_id=%s where url_id=%s", (url_update_history_id, url_id)) output_file.write(str(url_id) + "\n") with open(os.path.join(queue_directory, 'queue.txt')) as queue_file: while True: while freelist and not eof: line = queue_file.readline() if line == '': eof = True else: total_read_lines += 1 tmp = line.strip().split(" ") url_id, url = tmp[0], ' '.join(tmp[1:]) if int(url_id) not in downloaded_url_ids: domain = url.split('://')[1].split( '/')[0] domain_last_start_time = domains_last_start_times.get( domain, None) now = datetime.datetime.now() if not domain_last_start_time or ( now - domain_last_start_time ).total_seconds( ) >= DOWNLOAD_DOMAIN_THROTTLE_SECONDS: domains_last_start_times[ domain] = now c = freelist.pop() os.mkdir( os.path.join( tmpdir, str(url_id))) c.fp = open( os.path.join( tmpdir, str(url_id), "output"), "wb") c.hfp = open( os.path.join( tmpdir, str(url_id), "header"), "wb") c.setopt(pycurl.URL, url) c.setopt(pycurl.WRITEDATA, c.fp) c.setopt(pycurl.WRITEHEADER, c.hfp) m.add_handle(c) c.url_id = url_id c.url = url else: skipped_due_to_domain_start_time += 1 while True: ret, num_handles = m.perform() if ret != pycurl.E_CALL_MULTI_PERFORM: break while True: num_q, ok_list, err_list = m.info_read() for c in ok_list: c.fp.close() c.fp = None c.hfp.close() c.hfp = None m.remove_handle(c) save_result(c.url, c.url_id, response_code=c.getinfo( pycurl.RESPONSE_CODE)) freelist.append(c) for c, errno, errmsg in err_list: c.fp.close() c.fp = None c.hfp.close() c.hfp = None m.remove_handle(c) save_result(c.url, c.url_id, errno=errno, errmsg=errmsg) freelist.append(c) num_processed = num_processed + len( ok_list) + len(err_list) if max_downloads and num_processed >= int( max_downloads): reached_max_downloads = True break if num_q == 0: break if num_q == 0 and num_handles == 0 and eof: break if max_downloads and num_processed >= int( max_downloads): reached_max_downloads = True break m.select(1.0) if len(downloaded_url_ids) == total_read_lines: break elif (start_time - datetime.datetime.now() ).total_seconds() > MAX_DOWNLOAD_RUNTIME_SECONDS: break else: time.sleep(DOWNLOAD_ITERATIONS_SLEEP_SECONDS) finally: for c in m.handles: if getattr(c, 'fp', None) is not None: c.fp.close() c.fp = None if getattr(c, 'hfp', None) is not None: c.hfp.close() c.hfp = None c.close() m.close() return (len(downloaded_url_ids), num_processed, reached_max_downloads, total_read_lines, skipped_due_to_domain_start_time, total_stats['num_existing_hash_id'], total_stats['num_new_hash_id'], total_stats['num_error_urls'], total_stats['num_timeout_urls'])
def check_pause(self, call):
    # the app sleeps for 0.5 seconds
    self.curl.setopt(pycurl.URL, 'http://localhost:8380/pause')
    sio = util.BytesIO()
    state = dict(paused=False, resumed=False)
    if call:
        def writefunc(data):
            rv = sio.write(data)
            if not state['paused']:
                self.curl.pause(pycurl.PAUSE_ALL)
                state['paused'] = True
            return rv
    else:
        def writefunc(data):
            if not state['paused']:
                # cannot write to sio here, because
                # curl takes pause return value to mean that
                # nothing was written
                state['paused'] = True
                return pycurl.READFUNC_PAUSE
            else:
                return sio.write(data)

    def resume(*args):
        state['resumed'] = True
        self.curl.pause(pycurl.PAUSE_CONT)

    signal.signal(signal.SIGALRM, resume)
    # alarm for 1 second which is 0.5 seconds more than the server side
    # should sleep for
    signal.alarm(1)
    start = _time.time()
    self.curl.setopt(pycurl.WRITEFUNCTION, writefunc)

    m = pycurl.CurlMulti()
    m.add_handle(self.curl)

    # Number of seconds to wait for a timeout to happen
    SELECT_TIMEOUT = 1.0

    # Stir the state machine into action
    while 1:
        ret, num_handles = m.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break

    # Keep going until all the connections have terminated
    while num_handles:
        # The select method uses fdset internally to determine which file descriptors
        # to check.
        m.select(SELECT_TIMEOUT)
        while 1:
            if _time.time() - start > 2:
                # test is taking too long, fail
                assert False, 'Test is taking too long'
            ret, num_handles = m.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break

    # Cleanup
    m.remove_handle(self.curl)
    m.close()

    self.assertEqual('part1part2', sio.getvalue().decode())
    end = _time.time()
    # check that client side waited
    self.assertTrue(end - start > 1)
    assert state['resumed']
def test_multi_socket(self):
    urls = [
        'http://localhost:8380/success',
        'http://localhost:8381/success',
        'http://localhost:8382/success',
    ]
    socket_events = []

    # socket callback
    def socket(event, socket, multi, data):
        #print(event, socket, multi, data)
        socket_events.append((event, multi))

    # init
    m = pycurl.CurlMulti()
    m.setopt(pycurl.M_PIPELINING, 1)
    m.setopt(pycurl.M_SOCKETFUNCTION, socket)
    m.handles = []
    for url in urls:
        c = pycurl.Curl()
        # save info in standard Python attributes
        c.url = url
        c.body = util.StringIO()
        c.http_code = -1
        m.handles.append(c)
        # pycurl API calls
        c.setopt(c.URL, c.url)
        c.setopt(c.WRITEFUNCTION, c.body.write)
        m.add_handle(c)

    # get data
    num_handles = len(m.handles)
    while num_handles:
        while 1:
            ret, num_handles = m.socket_all()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        # currently no more I/O is pending, could do something in the meantime
        # (display a progress bar, etc.)
        m.select(0.1)

    for c in m.handles:
        # save info in standard Python attributes
        c.http_code = c.getinfo(c.HTTP_CODE)

    # at least in and remove events per socket
    assert len(socket_events) >= 6

    # print result
    for c in m.handles:
        self.assertEqual('success', c.body.getvalue())
        self.assertEqual(200, c.http_code)

    # multi, not curl handle
    self.check(pycurl.POLL_IN, m, socket_events)
    self.check(pycurl.POLL_REMOVE, m, socket_events)

    # close handles
    for c in m.handles:
        # pycurl API calls
        m.remove_handle(c)
        c.close()
    m.close()
def __init_curl(self, urls, maxconn=20): total = len(urls) num_conn = min(total, maxconn) m = pycurl.CurlMulti() m.handles = [] for i in range(num_conn): c = pycurl.Curl() c.html = None c = self.__setopt(c) m.handles.append(c) freelist = m.handles[:] num_processed = 0 while num_processed < total: while urls and freelist: c = freelist.pop() c.k, c.url = urls.pop(0) c.html = StringIO.StringIO() c.setopt(pycurl.URL, c.url) c.setopt(pycurl.WRITEFUNCTION, c.html.write) m.add_handle(c) while 1: ret, num_handles = m.perform() if ret != pycurl.E_CALL_MULTI_PERFORM: break while 1: num, ok, err = m.info_read() for c in ok: if len(re.findall("Error [#]2012", c.html.getvalue())) > 0: print " | Patent Errored Out Bro!" return self.sql.c.execute( "INSERT OR REPLACE INTO {tbl} (key, url, html, created) VALUES (?, ?, ?, ?)" .format(tbl=self.SQLtbl), (c.k, c.url, senAdd.uni2asc( c.html.getvalue()), time.time())) c.html.close() c.html = None m.remove_handle(c) freelist.append(c) ## for c in err: ## try: ## print " > error: {key}, {url}".format(key=c.k, url=c.url) ## except: ## print " > error" sys.stdout.write("{clear} - {x}".format(clear="\b" * 20, x=num_processed)) num_processed = num_processed + len(ok) + len(err) if num_processed % 800 == 0 and num_processed > 0: self.sql.conn.commit() if num == 0: break for c in m.handles: if c.html is not None: c.html.close() c.html = None c.close() m.close() self.sql.conn.commit() print ""
class DahuaEventThread(threading.Thread): """Connects to device and subscribes to events""" Devices = [] NumActivePlayers = 0 CurlMultiObj = pycurl.CurlMulti() NumCurlObjs = 0 def __init__(self, mqtt, cameras): """Construct a thread listening for events.""" self.basetopic = mqtt["basetopic"] self.client = paho.Client("CameraEvents-" + socket.gethostname(), clean_session=True) self.client.on_connect = self.mqtt_on_connect self.client.on_disconnect = self.mqtt_on_disconnect self.client.message_callback_add(self.basetopic + "/+/picture", self.mqtt_on_picture_message) self.client.message_callback_add(self.basetopic + "/+/alerts", self.mqtt_on_alert_message) self.client.will_set(self.basetopic + "/$online", False, qos=0, retain=True) self.alerts = True for device_cfg in cameras: device = DahuaDevice(device_cfg.get("name"), device_cfg, self.client, self.basetopic) self.Devices.append(device) CurlObj = pycurl.Curl() device.CurlObj = CurlObj CurlObj.setopt(pycurl.URL, device.url) CurlObj.setopt(pycurl.CONNECTTIMEOUT, 30) CurlObj.setopt(pycurl.TCP_KEEPALIVE, 1) CurlObj.setopt(pycurl.TCP_KEEPIDLE, 30) CurlObj.setopt(pycurl.TCP_KEEPINTVL, 15) if device.auth == 'digest': CurlObj.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_DIGEST) CurlObj.setopt(pycurl.USERPWD, "%s:%s" % (device.user, device.password)) else: CurlObj.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH) CurlObj.setopt(pycurl.USERPWD, "%s:%s" % (device.user, device.password)) CurlObj.setopt(pycurl.WRITEFUNCTION, device.OnReceive) self.CurlMultiObj.add_handle(CurlObj) self.NumCurlObjs += 1 _LOGGER.debug("Added Dahua device at: %s", device.url) #connect to mqtt broker _LOGGER.debug("Connecting to MQTT Broker") self.client.connect(mqtt["IP"], int(mqtt["port"]), 60) _LOGGER.debug("Starting MQTT Loop") self.client.loop_start() threading.Thread.__init__(self) self.stopped = threading.Event() def run(self): heartbeat = 0 """Fetch events""" while 1: Ret, NumHandles = self.CurlMultiObj.perform() if Ret != pycurl.E_CALL_MULTI_PERFORM: break Ret = self.CurlMultiObj.select(1.0) while not self.stopped.isSet(): # Sleeps to ease load on processor time.sleep(.05) heartbeat = heartbeat + 1 if heartbeat % 1000 == 0: _LOGGER.debug("Heartbeat: " + str(datetime.datetime.now())) if not self.client.connected_flag: self.client.reconnect() self.client.publish(self.basetopic + "/$heartbeat", str(datetime.datetime.now())) Ret, NumHandles = self.CurlMultiObj.perform() if NumHandles != self.NumCurlObjs: _, Success, Error = self.CurlMultiObj.info_read() for CurlObj in Success: DahuaDevice = next( iter( filter(lambda x: x.CurlObj == CurlObj, self.Devices)), None) if DahuaDevice.Reconnect: _LOGGER.debug("Dahua Reconnect: %s", DahuaDevice.Name) continue DahuaDevice.OnDisconnect("Success") DahuaDevice.Reconnect = time.time() + 5 for CurlObj, ErrorNo, ErrorStr in Error: DahuaDevice = next( iter( filter(lambda x: x.CurlObj == CurlObj, self.Devices)), None) if DahuaDevice.Reconnect: continue DahuaDevice.OnDisconnect("{0} ({1})".format( ErrorStr, ErrorNo)) DahuaDevice.Reconnect = time.time() + 5 for DahuaDevice in self.Devices: if DahuaDevice.Reconnect and DahuaDevice.Reconnect < time.time( ): self.CurlMultiObj.remove_handle(DahuaDevice.CurlObj) self.CurlMultiObj.add_handle(DahuaDevice.CurlObj) DahuaDevice.Reconnect = None #if Ret != pycurl.E_CALL_MULTI_PERFORM: break def mqtt_on_connect(self, client, userdata, flags, rc): if rc == 0: _LOGGER.info("Connected to MQTT OK Returned code={0}".format(rc)) self.client.connected_flag = True self.client.publish(self.basetopic + "/$online", True, 
qos=0, retain=True) self.client.publish(self.basetopic + "/$version", version) if self.alerts: state = "ON" else: state = "OFF" for device in self.Devices: device.alerts = state self.client.publish( self.basetopic + "/" + device.Name + "/alerts/state", state) #self.client.subscribe(self.basetopic +"/#") #self.client.subscribe("CameraEventsPy/alerts") else: _LOGGER.info( "Camera : {0}: Bad mqtt connection Returned code={1}".format( "self.Name", rc)) self.client.connected_flag = False def mqtt_on_disconnect(self, client, userdata, rc): logging.info("disconnecting reason " + str(rc)) self.client.connected_flag = False def mqtt_on_picture_message(self, client, userdata, msg): #if msg.payload.decode() == "Hello world!": _LOGGER.info("Picture Msg Received: Topic:{0} Payload:{1}".format( msg.topic, msg.payload)) msgchannel = msg.topic.split("/")[1] for device in self.Devices: channel = device.channelIsMine(msgchannel) if channel > -1: _LOGGER.debug( "Found Camera: {0} channel: {1}: Name:{2}".format( device.Name, channel, device.channels[channel])) device.SnapshotImage(channel + device.snapshotoffset, msgchannel, "Snap Shot Image") break def mqtt_on_alert_message(self, client, userdata, msg): if msg.payload == 'ON': newState = True else: newState = False deviceName = msg.topic.split('/')[1] _LOGGER.info("Camera: {0}: Msg Received: Topic:{1} Payload:{2}".format( deviceName, msg.topic, msg.payload)) for device in self.Devices: #channel = self.Devices[device].channelIsMine("Garage") if device.Name == deviceName: device.alerts = newState _LOGGER.info("Turning Alerts {0}".format(newState)) self.client.publish( self.basetopic + "/" + device.Name + "/alerts/state", msg.payload) def mqtt_on_cross_message(self, client, userdata, msg): if msg.payload == 'ON': newState = True else: newState = False deviceName = msg.topic.split('/')[1] _LOGGER.info("Camera: {0}: Msg Received: Topic:{1} Payload:{2}".format( deviceName, msg.topic, msg.payload)) for device in self.Devices: #channel = self.Devices[device].channelIsMine("Garage") if device.Name == deviceName: device.alerts = newState _LOGGER.info("Turning Alerts {0}".format(newState)) self.client.publish( self.basetopic + "/" + device.Name + "/alerts/state", msg.payload)
def test_multi_timer(self):
    urls = [
        'http://localhost:8380/success',
        'http://localhost:8381/success',
        'http://localhost:8382/success',
    ]
    timers = []

    # timer callback
    def timer(msecs):
        #print('Timer callback msecs:', msecs)
        timers.append(msecs)

    # init
    m = pycurl.CurlMulti()
    m.setopt(pycurl.M_TIMERFUNCTION, timer)
    m.handles = []
    for url in urls:
        c = pycurl.Curl()
        # save info in standard Python attributes
        c.url = url
        c.body = util.BytesIO()
        c.http_code = -1
        m.handles.append(c)
        # pycurl API calls
        c.setopt(c.URL, c.url)
        c.setopt(c.WRITEFUNCTION, c.body.write)
        m.add_handle(c)

    # get data
    num_handles = len(m.handles)
    while num_handles:
        while 1:
            ret, num_handles = m.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        # currently no more I/O is pending, could do something in the meantime
        # (display a progress bar, etc.)
        m.select(1.0)

    for c in m.handles:
        # save info in standard Python attributes
        c.http_code = c.getinfo(c.HTTP_CODE)

    # print result
    for c in m.handles:
        self.assertEqual('success', c.body.getvalue().decode())
        self.assertEqual(200, c.http_code)

    assert len(timers) > 0
    # libcurl 7.23.0 produces a 0 timer
    assert timers[0] >= 0
    # this assertion does not appear to hold on older libcurls
    # or apparently on any linuxes, see
    # https://github.com/p/pycurl/issues/19
    #if not util.pycurl_version_less_than(7, 24):
    #    self.assertEqual(-1, timers[-1])

    # close handles
    for c in m.handles:
        # pycurl API calls
        m.remove_handle(c)
        c.close()
    m.close()
def multi_get(urls, num_conn, timeout, err_callback, succ_callback, ua='semRushBot', percentile=100): result = {} queue = deque(list(urls)) cur_percentile = 0 print_percentile = 0 if not queue: return num_urls = len(queue) num_conn = min(num_conn, num_urls) assert 1 <= num_conn <= 10000, "invalid number of concurrent connections" assert 1 <= percentile <= 100, "invalid percentile" logging.debug("PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM)) m = pycurl.CurlMulti() m.handles = [] for i in range(num_conn): c = pycurl.Curl() c.fp = None c.setopt(pycurl.FOLLOWLOCATION, 1) c.setopt(pycurl.MAXREDIRS, 1) c.setopt(pycurl.CONNECTTIMEOUT, timeout) c.setopt(pycurl.TIMEOUT, timeout) c.setopt(pycurl.NOSIGNAL, 1) c.setopt(pycurl.USERAGENT, ua) m.handles.append(c) freelist = m.handles[:] num_processed = 0 bailout = 0 while num_processed < num_urls: if bailout: break while queue and freelist: c = freelist.pop() c.props = queue.popleft() if type(c.props['url']) == type(u''): c.props['url'] = c.props['url'].encode('utf8', 'replace') c.setopt(pycurl.URL, c.props['url']) try: c.setopt(pycurl.COOKIE, str(c.props['cookie'])) except KeyError: pass c.source = cStringIO.StringIO() c.header = cStringIO.StringIO() c.setopt(pycurl.HEADERFUNCTION, c.header.write) c.setopt(pycurl.WRITEFUNCTION, c.source.write) m.add_handle(c) while 1: ret, num_handles = m.perform() if ret != pycurl.E_CALL_MULTI_PERFORM: break while 1: num_q, ok_list, err_list = m.info_read() for c in ok_list: c.fp = None m.remove_handle(c) logging.debug("[ ok] %s" % (c.props['url'])) succ_callback(c) freelist.append(c) for c, errno, errmsg in err_list: c.fp = None m.remove_handle(c) logging.debug("[err] %s %s" % (c.props['url'], errmsg)) err_callback(c) freelist.append(c) num_processed = num_processed + len(ok_list) + len(err_list) if num_urls: cur_percentile = round(float(num_processed) / num_urls * 100) if cur_percentile % 10 == 0 and 0 < cur_percentile < 100 and print_percentile != cur_percentile: logging.info("%d%%" % cur_percentile) print_percentile = cur_percentile if (cur_percentile >= percentile): bailout = 1 break if num_q == 0: break m.select(1.0) m.close() return result
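# A brief usage sketch for multi_get() above. The callbacks receive the curl
# handle itself; the handle carries the original request dict in .props and
# the response buffers in .source / .header, as set up inside multi_get.
# The URLs below are placeholders for illustration only:
def _example_multi_get():
    def on_success(c):
        print('[ ok] %s %d bytes' % (c.props['url'], len(c.source.getvalue())))

    def on_error(c):
        print('[err] %s' % c.props['url'])

    multi_get([{'url': 'http://example.com/'}, {'url': 'http://example.org/'}],
              num_conn=2, timeout=10,
              err_callback=on_error, succ_callback=on_success)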
def __init__( self, url=None, username=None, password=None, token=None, insecure=False, ca_file=None, debug=False, log=None, kerberos=False, timeout=0, compress=True, sso_url=None, sso_revoke_url=None, sso_token_name='access_token', headers=None, pipeline=0, connections=0, ): """ Creates a new connection to the API server. This method supports the following parameters: `url`:: A string containing the base URL of the server, usually something like `https://server.example.com/ovirt-engine/api`. `username`:: The name of the user, something like `admin@internal`. `password`:: The name password of the user. `token`:: : The token to be used to access API. Optionally, user can use token, instead of username and password to access API. If user don't specify `token` parameter, SDK will automatically create one. `insecure`:: A boolean flag that indicates if the server TLS certificate and host name should be checked. `ca_file`:: A PEM file containing the trusted CA certificates. The certificate presented by the server will be verified using these CA certificates. If `ca_file` parameter is not set, system wide CA certificate store is used. `debug`:: A boolean flag indicating if debug output should be generated. If the value is `True` and the `log` parameter isn't `None` then the data sent to and received from the server will be written to the log. Be aware that user names and passwords will also be written, so handle it with care. `log`:: The logger where the log messages will be written. `kerberos`:: A boolean flag indicating if Kerberos authentication should be used instead of the default basic authentication. `timeout`:: The maximum total time to wait for the response, in seconds. A value of zero (the default) means wait for ever. If the timeout expires before the response is received an exception will be raised. `compress`:: A boolean flag indicating if the SDK should ask the server to send compressed responses. The default is `True`. Note that this is a hint for the server, and that it may return uncompressed data even when this parameter is set to `True`. Note that compression will be disabled if user pass `debug` parameter set to `true`, so the debug messages are in plain text. `sso_url`:: A string containing the base SSO URL of the serve. Default SSO url is computed from the `url` if no `sso_url` is provided. `sso_revoke_url`:: A string containing the base URL of the SSO revoke service. This needs to be specified only when using an external authentication service. By default this URL is automatically calculated from the value of the `url` parameter, so that SSO token revoke will be performed using the SSO service that is part of the engine. `sso_token_name`:: The token name in the JSON SSO response returned from the SSO server. Default value is `access_token`. `headers`:: A dictionary with headers which should be send with every request. `connections`:: The maximum number of connections to open to the host. If the value is `0` (the default) then the number of connections will be unlimited. `pipeline`:: The maximum number of request to put in an HTTP pipeline without waiting for the response. If the value is `0` (the default) then pipelining is disabled. 
""" # Check mandatory parameters: if url is None: raise Error('The \'url\' parameter is mandatory') # Check that the CA file exists if insecure is not set: if not insecure: if ca_file is not None and not os.path.exists(ca_file): raise Error('The CA file \'%s\' doesn\'t exist' % ca_file) # Save the URL: self._url = url # Save the logger: self._log = log # Save the credentials: self._username = username self._password = password self._sso_token = token self._kerberos = kerberos self._ca_file = ca_file self._insecure = insecure self._timeout = timeout self._debug = debug self._compress = compress # The curl object can be used by several threads, but not # simultaneously, so we need a lock to prevent that: self._curl_lock = threading.Lock() # Set SSO attributes: self._sso_url = sso_url self._sso_revoke_url = sso_revoke_url self._sso_token_name = sso_token_name # Headers: self._headers = headers or {} # Create the curl handle that manages the pool of connections: self._multi = pycurl.CurlMulti() self._multi.setopt(pycurl.M_PIPELINING, bool(pipeline)) # Since libcurl 7.30.0: if hasattr(pycurl, 'M_MAX_PIPELINE_LENGTH'): self._multi.setopt(pycurl.M_MAX_PIPELINE_LENGTH, pipeline) self._multi.setopt(pycurl.M_MAX_HOST_CONNECTIONS, connections) # Connections: self._curls = set() # Initialize the reference to the system service: self.__system_service = None
def retrieve(self): if self.remote_inst_id is None: self.target_queue.append(RootTarget(self, self.endpoint_root)) num_conn = 1 full = True else: num_conn = self.get_cand_num_conn() if not num_conn: return False full = num_conn >= self.own_max_num_conn if full: num_conn = self.own_max_num_conn m = pycurl.CurlMulti() m.handles = [] for i in range(num_conn): c = pycurl.Curl() c.setopt(pycurl.CONNECTTIMEOUT, 30) c.setopt(pycurl.TIMEOUT, 300) m.handles.append(c) freelist = m.handles[:] num_started = 0 num_processed = 0 while True: target = None if freelist: target = self.pop_target() while target: assert freelist c = freelist.pop() num_started += 1 if target.get_verb() == 'DELETE': c.setopt(pycurl.CUSTOMREQUEST, 'DELETE') else: # reset after potential DELETE c.unsetopt(pycurl.CUSTOMREQUEST) c.setopt(pycurl.HTTPGET, True) c.setopt(pycurl.URL, target.url) c.target = target if target.has_plaintext_body(): c.setopt(pycurl.ENCODING, b'gzip') c.setopt(c.HEADERFUNCTION, target.handle_header) c.setopt(pycurl.WRITEDATA, target) m.add_handle(c) if freelist: target = self.pop_target() else: target = None while True: ret, num_handles = m.perform() if ret != pycurl.E_CALL_MULTI_PERFORM: break while True: num_q, ok_list, err_list = m.info_read() for c in ok_list: target = c.target m.remove_handle(c) msg_verb = "deleted" if target.get_verb() == 'DELETE' else "got" eff_url = c.getinfo(pycurl.EFFECTIVE_URL) msg = msg_verb + " " + eff_url if not target.succeeded(): if target.http_code is None: msg += " with no HTTP status" else: msg += " with %d" % target.http_code print(msg, file=sys.stderr) target.close() c.target = None freelist.append(c) for c, errno, errmsg in err_list: target = c.target target.close() c.target = None m.remove_handle(c) self.report_error(target, errno, errmsg) freelist.append(c) num_processed += len(ok_list) + len(err_list) if self.healthcheck_interval and (self.total_processed >= self.total_checked + self.healthcheck_interval): self.healthcheck() if num_q == 0: break if num_started == num_processed: break m.select(1.0) for c in m.handles: if hasattr(c, 'target') and (c.target is not None): c.target.close() c.target = None c.close() m.close() return full or len(self.target_queue)
def load_urls(collection, shape=(8, 256, 256), max_retries=MAX_RETRIES): sockets = set() socket_events = [] timeout = 0 def _fsocket(event, socket, multi, data): if event == pycurl.POLL_REMOVE: sockets.remove(socket) else: if socket not in sockets: sockets.add(socket) socket_events.append((event, multi)) mc = pycurl.CurlMulti() mc.setopt(pycurl.M_SOCKETFUNCTION, _fsocket) #mc.setopt(pycurl.M_PIPELINING, True) nhandles = len(collection) results, cmap = {}, {} for url, token, index in collection: index = tuple(index) _curl, fp = _setup_curl(url, token, index) cmap[index] = (_curl, fp) mc.add_handle(_curl) while (pycurl.E_CALL_MULTI_PERFORM == mc.socket_all()[0]): pass timeout = mc.timeout() nprocessed = 0 while timeout >= 0: (rr, wr, er) = select.select(sockets, sockets, sockets, timeout / 1000.0) socketSet = set(rr + wr + er) if socketSet: for s in socketSet: while True: (ret, running) = mc.socket_all() if ret != pycurl.E_CALL_MULTI_PERFORM: break nq, suc, failed = mc.info_read() nprocessed += len(suc) for h in suc: _fp = cmap[h.index][-1] _fp.flush() _fp.close() try: arr = imread(_fp.name) if len(arr.shape) == 3: arr = np.rollaxis(arr, 2, 0) else: arr = np.expand_dims(arr, axis=0) except Exception as e: print(e) arr = np.zeros(shape, dtype=np.float32) finally: results[h.index] = arr h.close() mc.remove_handle(h) os.remove(_fp.name) for h, err_num, err_msg in failed: print('failed: {}, code={}, msg={}'.format( h.index, err_num, err_msg)) _fp = cmap[h.index][-1] _fp.flush() _fp.close() os.remove(_fp.name) h.close() mc.remove_handle(h) _curl, fp = _setup_curl(h.url, h.token, h.index) cmap[h.index] = (_curl, fp) mc.add_handle(_curl) else: (ret, running) = mc.socket_action(pycurl.SOCKET_TIMEOUT, 0) if running == 0: break mc.close() return results
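# load_urls() above depends on a _setup_curl() helper that is not shown. The
# sketch below is an assumption inferred only from how its return values are
# used there (a curl handle carrying .url / .token / .index attributes plus a
# named temporary file it writes into); the suffix and Authorization header
# format are guesses, not the original helper:
import tempfile
import pycurl

def _setup_curl(url, token, index):
    fp = tempfile.NamedTemporaryFile(suffix='.tif', delete=False)
    _curl = pycurl.Curl()
    _curl.setopt(pycurl.URL, url)
    if token:
        _curl.setopt(pycurl.HTTPHEADER, ['Authorization: Bearer {}'.format(token)])
    _curl.setopt(pycurl.WRITEDATA, fp)
    # attributes read back by load_urls() after completion
    _curl.url = url
    _curl.token = token
    _curl.index = tuple(index)
    return _curl, fp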
    if not url or url[0] == "#":
        continue
    filename = "doc_%03d.dat" % (len(queue) + 1)
    queue.append((url, filename))

# Check args
assert queue, "no URLs given"
num_urls = len(queue)
num_conn = min(num_conn, num_urls)
assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
print("PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM))
print("----- Getting", num_urls, "URLs using", num_conn, "connections -----")

# Pre-allocate a list of curl objects
m = pycurl.CurlMulti()
m.handles = []
for i in range(num_conn):
    c = pycurl.Curl()
    c.fp = None
    c.setopt(pycurl.FOLLOWLOCATION, 1)
    c.setopt(pycurl.MAXREDIRS, 5)
    c.setopt(pycurl.CONNECTTIMEOUT, 30)
    c.setopt(pycurl.TIMEOUT, 300)
    c.setopt(pycurl.NOSIGNAL, 1)
    m.handles.append(c)

# Main loop
freelist = m.handles[:]
num_processed = 0
while num_processed < num_urls:
class DahuaEventThread(threading.Thread):
    """Connects to device and subscribes to events"""

    Devices = []
    NumActivePlayers = 0
    CurlMultiObj = pycurl.CurlMulti()
    NumCurlObjs = 0

    def __init__(self, hass, config):
        """Construct a thread listening for events."""
        self.hass = hass

        for device_cfg in config:
            url = URL_TEMPLATE.format(protocol=device_cfg.get("protocol"),
                                      host=device_cfg.get("host"),
                                      port=device_cfg.get("port"),
                                      events=device_cfg.get("events"))
            channels = device_cfg.get("channels")
            channels_dict = {}
            if channels is not None:
                for channel in channels:
                    channels_dict[channel.get("number")] = channel.get("name")

            device = DahuaDevice(self, hass, device_cfg.get("name"), url, channels_dict)
            self.Devices.append(device)

            CurlObj = pycurl.Curl()
            device.CurlObj = CurlObj

            CurlObj.setopt(pycurl.URL, url)
            CurlObj.setopt(pycurl.CONNECTTIMEOUT, 30)
            CurlObj.setopt(pycurl.TCP_KEEPALIVE, 1)
            CurlObj.setopt(pycurl.TCP_KEEPIDLE, 30)
            CurlObj.setopt(pycurl.TCP_KEEPINTVL, 15)
            CurlObj.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_DIGEST)
            CurlObj.setopt(pycurl.USERPWD, "%s:%s" % (device_cfg.get("user"),
                                                      device_cfg.get("password")))
            CurlObj.setopt(pycurl.WRITEFUNCTION, device.OnReceive)

            self.CurlMultiObj.add_handle(CurlObj)
            self.NumCurlObjs += 1

            _LOGGER.debug("Added Dahua device at: %s", url)

        threading.Thread.__init__(self)
        self.stopped = threading.Event()

    def run(self):
        """Fetch events"""
        while 1:
            Ret, NumHandles = self.CurlMultiObj.perform()
            if Ret != pycurl.E_CALL_MULTI_PERFORM:
                break

        Ret = self.CurlMultiObj.select(1.0)

        while not self.stopped.isSet():
            # Sleeps to ease load on processor
            time.sleep(.05)
            Ret, NumHandles = self.CurlMultiObj.perform()

            if NumHandles != self.NumCurlObjs:
                _, Success, Error = self.CurlMultiObj.info_read()

                for CurlObj in Success:
                    DahuaDevice = next(
                        filter(lambda x: x.CurlObj == CurlObj, self.Devices))
                    if DahuaDevice.Reconnect:
                        continue

                    DahuaDevice.OnDisconnect("Success")
                    DahuaDevice.Reconnect = time.time() + 5

                for CurlObj, ErrorNo, ErrorStr in Error:
                    DahuaDevice = next(
                        filter(lambda x: x.CurlObj == CurlObj, self.Devices))
                    if DahuaDevice.Reconnect:
                        continue

                    DahuaDevice.OnDisconnect("{0} ({1})".format(ErrorStr, ErrorNo))
                    DahuaDevice.Reconnect = time.time() + 5

            for DahuaDevice in self.Devices:
                if DahuaDevice.Reconnect and DahuaDevice.Reconnect < time.time():
                    self.CurlMultiObj.remove_handle(DahuaDevice.CurlObj)
                    self.CurlMultiObj.add_handle(DahuaDevice.CurlObj)
                    DahuaDevice.Reconnect = None
def getdata(urls, ckey, cert, headers=None, options=None, num_conn=50, cookie=None):
    """
    Get data for given list of urls, using provided number of connections
    and user credentials
    """
    if not options:
        options = pycurl_options()
    # Make a queue with urls
    queue = [u for u in urls if validate_url(u)]

    # Check args
    num_urls = len(queue)
    num_conn = min(num_conn, num_urls)

    # Pre-allocate a list of curl objects
    mcurl = pycurl.CurlMulti()
    mcurl.handles = []
    for _ in range(num_conn):
        curl = pycurl.Curl()
        curl.fp = None
        for key, val in viewitems(options):
            curl.setopt(getattr(pycurl, key), val)
        curl.setopt(pycurl.SSLKEY, ckey)
        curl.setopt(pycurl.SSLCERT, cert)
        mcurl.handles.append(curl)
        if headers:
            curl.setopt(pycurl.HTTPHEADER,
                        ["%s: %s" % (k, v) for k, v in viewitems(headers)])

    # Main loop
    freelist = mcurl.handles[:]
    num_processed = 0
    while num_processed < num_urls:
        # If there is an url to process and a free curl object,
        # add to multi-stack
        while queue and freelist:
            url = queue.pop(0)
            curl = freelist.pop()
            curl.setopt(pycurl.URL, url.encode('ascii', 'ignore'))
            if cookie and url in cookie:
                curl.setopt(pycurl.COOKIEFILE, cookie[url])
                curl.setopt(pycurl.COOKIEJAR, cookie[url])
            bbuf = BytesIO()
            hbuf = BytesIO()
            curl.setopt(pycurl.WRITEFUNCTION, bbuf.write)
            curl.setopt(pycurl.HEADERFUNCTION, hbuf.write)
            mcurl.add_handle(curl)
            # store some info
            curl.hbuf = hbuf
            curl.bbuf = bbuf
            curl.url = url

        # Run the internal curl state machine for the multi stack
        while True:
            ret, _ = mcurl.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break

        # Check for curl objects which have terminated, and add them to the
        # freelist
        while True:
            num_q, ok_list, err_list = mcurl.info_read()
            for curl in ok_list:
                if sys.version.startswith('3.'):
                    hdrs = curl.hbuf.getvalue().decode('utf-8')
                    data = curl.bbuf.getvalue().decode('utf-8')
                else:
                    hdrs = curl.hbuf.getvalue()
                    data = curl.bbuf.getvalue()
                url = curl.url
                curl.bbuf.flush()
                curl.bbuf.close()
                curl.hbuf.close()
                curl.hbuf = None
                curl.bbuf = None
                mcurl.remove_handle(curl)
                freelist.append(curl)
                yield {'url': url, 'data': data, 'headers': hdrs}
            for curl, errno, errmsg in err_list:
                hdrs = curl.hbuf.getvalue()
                data = curl.bbuf.getvalue()
                url = curl.url
                curl.bbuf.flush()
                curl.bbuf.close()
                curl.hbuf.close()
                curl.hbuf = None
                curl.bbuf = None
                mcurl.remove_handle(curl)
                freelist.append(curl)
                yield {'url': url, 'data': None, 'headers': hdrs,
                       'error': errmsg, 'code': errno}
            num_processed = num_processed + len(ok_list) + len(err_list)
            if num_q == 0:
                break

        # Currently no more I/O is pending, could do something in the meantime
        # (display a progress bar, etc.).
        # We just call select() to sleep until some more data is available.
        mcurl.select(1.0)

    cleanup(mcurl)
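# Hypothetical consumption of the getdata() generator above; the URLs,
# key/cert paths, and headers are placeholders, not values from the
# original project.
urls = ['https://example.org/api/a', 'https://example.org/api/b']
for row in getdata(urls, ckey='/tmp/userkey.pem', cert='/tmp/usercert.pem',
                   headers={'Accept': 'application/json'}, num_conn=2):
    if 'error' in row:
        print('failed:', row['url'], row['code'], row['error'])
    else:
        print('fetched:', row['url'], len(row['data']), 'bytes')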
def test_multi_close_twice(self):
    m = pycurl.CurlMulti()
    m.close()
    m.close()
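# The snippets above share one pattern: keep a pool of easy handles, feed
# them URLs, pump CurlMulti.perform() until it stops asking to be called
# again, harvest finished transfers with info_read(), and block in select()
# while nothing is ready. The following is a minimal, self-contained sketch
# of that pattern; it is not taken from any of the projects above.
import pycurl
from io import BytesIO


def fetch_all(urls, num_conn=4):
    """Download every URL and return {url: bytes, or None on error}."""
    queue = list(urls)
    results = {}
    m = pycurl.CurlMulti()
    free = [pycurl.Curl() for _ in range(min(num_conn, len(queue)))]
    handles = free[:]
    num_done = 0
    total = len(queue)
    while num_done < total:
        # Assign queued URLs to idle handles.
        while queue and free:
            url = queue.pop(0)
            c = free.pop()
            buf = BytesIO()
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.WRITEFUNCTION, buf.write)
            c.url, c.buf = url, buf
            m.add_handle(c)
        # Drive transfers until libcurl no longer wants an immediate re-call.
        while True:
            ret, _ = m.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        # Collect handles that finished, successfully or not.
        while True:
            num_q, ok_list, err_list = m.info_read()
            for c in ok_list:
                results[c.url] = c.buf.getvalue()
                m.remove_handle(c)
                free.append(c)
            for c, errno, errmsg in err_list:
                results[c.url] = None
                m.remove_handle(c)
                free.append(c)
            num_done += len(ok_list) + len(err_list)
            if num_q == 0:
                break
        if num_done < total:
            m.select(1.0)
    for c in handles:
        c.close()
    m.close()
    return results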