def get_opener():
    """Create and return a fresh, configured `pycurl.Curl` handle.

    The handle gets two bookkeeping attributes used elsewhere in this
    module: ``fp`` (response file object slot, initially None) and
    ``dirty`` (False while the handle has not been used for a request).

    :return: a new :class:`pycurl.Curl` object with NOSIGNAL enabled
    """
    curl_handle = pycurl.Curl()
    curl_handle.fp = None
    # NOSIGNAL avoids SIGALRM-based timeouts, required for threaded use.
    curl_handle.setopt(pycurl.NOSIGNAL, 1)
    curl_handle.dirty = False
    return curl_handle
def test_get(self):
    """GET the test server's /get endpoint and verify status, echoed URL
    and the PycURL default User-Agent reported in the JSON body."""
    c = pycurl.Curl()
    body_output = StringIO()
    headers_output = StringIO()
    url = build_url('get')
    # Reuse the computed url (was a second build_url('get') call) so the
    # request target and the asserted value cannot diverge — matches
    # test_head's pattern.
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.WRITEFUNCTION, body_output.write)
    c.setopt(pycurl.HEADERFUNCTION, headers_output.write)
    c.perform()
    self.assertEquals(200, c.getinfo(pycurl.RESPONSE_CODE))
    c.close()
    json_body = json.loads(body_output.getvalue())
    self.assertEquals(json_body['url'], url)
    # pycurl.version_info()[1] is the libcurl version string.
    self.assertEquals(json_body['headers']['User-Agent'],
                      "PycURL/%s" % pycurl.version_info()[1])
def get_ex_sentence(w): count = 1 f1 = codecs.open('%s.txt' % u'_'.join(w[0].split(' ')), 'wb', 'utf-8') f1.close() while True: c = pycurl2.Curl() if count == 1: url = 'http://ejje.weblio.jp/sentence/content/%s' % '+'.join( w[0].split(' ')) else: url = 'http://ejje.weblio.jp/sentence/content/%s/%d' % ('+'.join( w[0].split(' ')), count) print url count += 1 c.setopt(c.URL, str(url)) f = open('test.txt', 'wb') c.setopt(c.WRITEDATA, f) c.perform() c.reset() f.close() sentences = read_url(w) if sentences == '0' and count == 0: print 'Can\'t find any sentence\n' return None break elif sentences == '1': print 'Can\'t find enough sentences\n' return None break elif sentences == '0' and count != 0: if snt_total(w) > 10: return 1 break else: print 'Can\'t find enough sentences\n' return None break elif snt_total(w) > 40: return 1 break elif count >= 5: return 1 break
def test_head(self):
    """Issue a HEAD request (NOBODY) against /head and verify: 200 status,
    effective URL, an empty body, and a well-formed HTTP status line."""
    c = pycurl.Curl()
    body_buf = StringIO()
    header_buf = StringIO()
    url = build_url('head')
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.NOBODY, True)
    c.setopt(pycurl.WRITEFUNCTION, body_buf.write)
    c.setopt(pycurl.HEADERFUNCTION, header_buf.write)
    c.perform()
    self.assertEquals(200, c.getinfo(pycurl.RESPONSE_CODE))
    self.assertEquals(url, c.getinfo(pycurl.EFFECTIVE_URL))
    c.close()
    # NOBODY means no response body may be written at all.
    self.assertEquals(len(body_buf.getvalue()), 0)
    status_line = header_buf.getvalue().split("\r\n")[0]
    self.assertEquals(status_line, "HTTP/1.1 200 OK")
def make_pycurl(cls, method, url, headers=None, data=None):
    """Build a configured `pycurl.Curl` handle for `method` on `url`.

    :param method: HTTP method name (e.g. "GET", "POST", "PUT")
    :param url: target URL
    :param headers: optional dict of request headers
    :param data: optional request body for POST/PUT (defaults to "")
    :return: ``(curl, header_output, body_output)`` — the handle plus the
        BytesIO buffers that will receive response headers and body
    """
    header_output = BytesIO()
    body_output = BytesIO()
    c = pycurl.Curl()
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.NOSIGNAL, 1)
    if isinstance(headers, dict):
        c.setopt(pycurl.HTTPHEADER,
                 ["%s: %s" % (capwords(f, "-"), v)
                  for f, v in CaseInsensitiveDict(headers).iteritems()])
    c.setopt(pycurl.CONNECTTIMEOUT, 3)
    c.setopt(pycurl.TIMEOUT, 3)
    # Known methods map to native curl options; anything else that is
    # still supported goes through CUSTOMREQUEST.
    if method in cls._curl_options:
        c.setopt(cls._curl_options[method], True)
    elif method in cls.SUPPORTED_METHODS:
        c.setopt(pycurl.CUSTOMREQUEST, method)
    if method in ("POST", "PUT"):
        if data is None:
            data = ""
        body_inout = BytesIO(data)
        c.setopt(pycurl.READFUNCTION, body_inout.read)

        def ioctl(cmd):
            # Rewind the body buffer when libcurl restarts the upload
            # (e.g. after a redirect or authentication round-trip).
            if cmd == pycurl.IOCMD_RESTARTREAD:
                body_inout.seek(0)

        c.setopt(pycurl.IOCTLFUNCTION, ioctl)
        if method == "PUT":
            c.setopt(pycurl.PUT, True)
            c.setopt(pycurl.INFILESIZE, len(data))
        else:
            c.setopt(pycurl.POST, True)
            c.setopt(pycurl.POSTFIELDSIZE, len(data))
    c.setopt(pycurl.HEADERFUNCTION, header_output.write)
    # BUG FIX: the original set HEADERFUNCTION twice — the second call
    # passed body_output.write, clobbering the header callback and leaving
    # body_output forever empty. The response body must go through
    # WRITEFUNCTION.
    c.setopt(pycurl.WRITEFUNCTION, body_output.write)
    return (c, header_output, body_output)
def multi_get(wf, urls, debug=0, num_conn=100, timeout=5, ua=None, ref=None, percentile=100, cf=None, follow=1, ref_dict=None): if ua is None: ua = 'multi_get' queue = [] wf_keys = dict.fromkeys(wf.keys(), 1) for url in dict.fromkeys(urls).keys(): url = url.strip() if len(url) > 250: wf[url] = '---' continue if not url or url[0] == "#" or url in wf_keys: continue filename = "[%03d]" % (len(queue) + 1) queue.append((url, filename)) if not queue: return num_urls = len(queue) num_conn = min(num_conn, num_urls) assert 1 <= num_conn <= 10000, "invalid number of concurrent connections" if debug: print "PycURL %s (compiled against 0x%x)" % ( pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM) if debug: print "----- Getting", num_urls, "URLs using", num_conn, "connections -----" m = pycurl.CurlMulti() m.handles = [] for i in range(num_conn): c = pycurl.Curl() c.fp = None if follow: c.setopt(pycurl.FOLLOWLOCATION, 1) c.setopt(pycurl.MAXREDIRS, 5) c.setopt(pycurl.CONNECTTIMEOUT, timeout) c.setopt(pycurl.TIMEOUT, timeout) c.setopt(pycurl.NOSIGNAL, 1) c.setopt(pycurl.USERAGENT, ua) if cf: c.setopt(pycurl.COOKIEFILE, cf) c.setopt(pycurl.COOKIEJAR, cf) if ref: c.setopt(pycurl.REFERER, ref) m.handles.append(c) from UserString import MutableString freelist = m.handles[:] num_processed = 0 bailout = 0 while num_processed < num_urls: if bailout: break while queue and freelist: url, filename = queue.pop(0) if '.pdf' not in url: c = freelist.pop() if type(url) == type(u''): url = url.encode('utf8', 'replace') c.setopt(pycurl.URL, url) c.res = cStringIO.StringIO() c.setopt(pycurl.WRITEFUNCTION, c.res.write) if ref_dict is not None: if ref_dict.get(url, ''): c.setopt(pycurl.REFERER, ref_dict.get(url, '')) m.add_handle(c) c.filename = filename c.url = url else: wf[url] = '---' num_urls -= 1 while 1: ret, num_handles = m.perform() if ret != pycurl.E_CALL_MULTI_PERFORM: break while 1: num_q, ok_list, err_list = m.info_read() for c in ok_list: c.fp = None m.remove_handle(c) text = 
c.res.getvalue() if len(text) > 100000: text = '' wf[c.url] = text try: if debug: print "[ ok] %5s %40s" % (c.filename, c.url[:40]) except: pass freelist.append(c) for c, errno, errmsg in err_list: c.fp = None m.remove_handle(c) if debug: print "[err] %5s %40s" % (c.filename, c.url[:40]) wf[c.url] = '---' freelist.append(c) num_processed = num_processed + len(ok_list) + len(err_list) if num_urls: if float(num_processed) / num_urls * 100 > percentile: bailout = 1 break if num_q == 0: break m.select(1.0) m.close()
from general import save_page, curl_init, crawlerlog
import pycurl2 as pycurl
import StringIO
import datetime as dt
import time
import urllib

if __name__ == '__main__':
    # Script: prepare a login POST for www.my089.com's ASP.NET form.
    # The __VIEWSTATE / __EVENTVALIDATION values below are snapshots
    # copied from a captured page and will go stale; the server is
    # expected to reject them once they expire — TODO confirm.
    login_url = 'https://www.my089.com/login.aspx'
    # NOTE(review): detail_url is defined but not used in this visible
    # section — presumably fetched after login further down the file.
    detail_url = 'http://www.my089.com/Loan/Detail.aspx?sid=13090511403742990287210017241327'
    pycurl.global_init(pycurl.GLOBAL_ALL)
    curl = pycurl.Curl()  # curl object init
    curl_init(curl)
    # Set target URL; COOKIEFILE enables the cookie engine so the session
    # cookie from the login response is retained.
    curl.setopt(pycurl.URL, login_url)
    curl.setopt(pycurl.COOKIEFILE, 'cookie_file')
    # SECURITY NOTE(review): the username/password hash below are
    # hard-coded and are also printed to stdout by the urlencode echo —
    # these credentials should be moved to a config/secret store and the
    # print removed before this ships anywhere.
    post_param = {
        '__VIEWSTATE': '/wEPDwUKMTc3Mzg3OTU5Mw9kFgJmD2QWAmYPZBYCAgQPZBYCAgEPZBYCAgEPZBYEAgUPFgIeB1Zpc2libGVoFgICAQ8PFggeBElzSFpoHgNTSUQFJFNJRF84MDQ4NGFjNGQyZTc0NDUwYTM5OTgwMjdhNTEyYjNiNR4HVmVyc2lvbgUBTx8AaGQWAgICDxYEHgdvbmNsaWNrBXNXZWJGb3JtX0RvQ2FsbGJhY2soJ2N0bDAwJGN0bDAwJENvbnRlbnRQbGFjZUhvbGRlcjEkQ29udGVudFBsYWNlSG9sZGVyMSRSYW5kb21Db2RlMScsIiIsR2V0Q2hlY2tDb2RlLCIiLG51bGwsZmFsc2UpHgNzcmMFfy4uL2NvbW1vblBhZ2UvcnZjLmFzcHg/c2lkPUE1RThCNjdFQjM1RTM3NERCNUUyMDlDMzg2MjQ1MTM3NUVCRTA2ODIzOTJBMkRERUVCRThEN0JBRDMzOTZFNjYxNzdGNjVGNTNEMDg0RUE4JnI9MDcyMTIwMzUxJnY9TyZpPUZkAgcPDxYCHg1PbkNsaWVudENsaWNrBb8DaWYoIVZhbGlkYXRlTG9naW4oJ2N0bDAwX2N0bDAwX0NvbnRlbnRQbGFjZUhvbGRlcjFfQ29udGVudFBsYWNlSG9sZGVyMV90eHRVc2VyTmFtZScsJ2N0bDAwX2N0bDAwX0NvbnRlbnRQbGFjZUhvbGRlcjFfQ29udGVudFBsYWNlSG9sZGVyMV90eHRQd2QnKSkgcmV0dXJuIGZhbHNlO0VuY3J5cHRUZXh0KCdjdGwwMF9jdGwwMF9Db250ZW50UGxhY2VIb2xkZXIxX0NvbnRlbnRQbGFjZUhvbGRlcjFfdHh0UHdkJyk7IHZhciBoPWRvY3VtZW50LmdldEVsZW1lbnRCeUlkKCdjdGwwMF9jdGwwMF9Db250ZW50UGxhY2VIb2xkZXIxX0NvbnRlbnRQbGFjZUhvbGRlcjFfaGZwd2QnKTsgdmFyIHA9ZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQoJ2N0bDAwX2N0bDAwX0NvbnRlbnRQbGFjZUhvbGRlcjFfQ29udGVudFBsYWNlSG9sZGVyMV90eHRQd2QnKTsgaC52YWx1ZT1wLnZhbHVlO3AudmFsdWU9Jyc7ZGRk/Tr0LeEraW4ekqOQICduvYfIfZ8=',
        '__EVENTVALIDATION': '/wEWEALCoejmBgLVz5eVAwLFqsi1BQKC+8rVCAKJ9JOUBwKF9KOUBwKH9J+UBwKA9J+UBwKjtdhDAq21gEACqbXwQwKfyMGlAwLWm4jIDwKptbD+CwKFj72BCwK/w5iEDMTYGemgoCSMD87cPvxc/++ORM6h',
        'ctl00$ctl00$ContentPlaceHolder1$ContentPlaceHolder1$txtUserName': '******',
        'ctl00$ctl00$ContentPlaceHolder1$ContentPlaceHolder1$txtPwd': '7uko098I3',
        'ctl00$ctl00$ContentPlaceHolder1$ContentPlaceHolder1$hfpwd': '7ecda547d0b615a0b01f7ff4747cf024'
    }
    print urllib.urlencode(post_param)
    curl.setopt(pycurl.POSTFIELDS, urllib.urlencode(post_param))
def build_opener(self, url, opener=None):
    """Compile a `pycurl.Curl` instance from this request's settings.

    Configures (or re-configures) `opener` with this instance's auth,
    headers, redirects, timeouts, debug, gzip, interface, proxy, TLS,
    cookie, method and body settings, wires the output writers, and
    stores the handle on ``self._opener``.

    :param url: resource url
    :param opener: optional existing `pycurl.Curl` to reuse; a "dirty"
        (already-used) handle is reset via ``self.clean_opener``
    :return: the configured `pycurl.Curl` instance (body/header BytesIO
        buffers are stored on ``self.body_output`` / ``self.headers_output``)
    :raises InvalidMethod: for an HTTP method outside SUPPORTED_METHODS
    :raises InterfaceError: when proxy auth data is not a 2-tuple
    """
    # http://curl.haxx.se/mail/curlpython-2005-06/0004.html
    # http://curl.haxx.se/mail/lib-2010-03/0114.html
    opener = opener or pycurl.Curl()
    if getattr(opener, "dirty", True):
        opener = self.clean_opener(opener)
    logger.debug("Open url: %s" % url)
    opener.setopt(pycurl.URL, url)
    opener.setopt(pycurl.NOSIGNAL, 1)

    # --- authentication -------------------------------------------------
    if isinstance(self._auth, AuthManager):
        self._auth.setup_request(self)
        self._auth.setup(opener)
    elif self._netrc:
        self.setup_netrc(opener)
    else:
        # Clear any credentials left over from a reused handle.
        opener.unsetopt(pycurl.USERPWD)

    # --- custom headers -------------------------------------------------
    if self._headers:
        logger.debug("Setup custom headers %s" %
                     "\r\n".join(["%s: %s" % (f, v) for f, v in
                                  CaseInsensitiveDict(self._headers).items()]))
        opener.setopt(pycurl.HTTPHEADER,
                      ["%s: %s" % (capwords(f, "-"), v) for f, v in
                       CaseInsensitiveDict(self._headers).items()])

    # Option -L: follow "Location: " hints
    if self._allow_redirects is True:
        logger.debug("Allow redirects")
        opener.setopt(pycurl.FOLLOWLOCATION, self._allow_redirects)
        if self._max_redirects:
            opener.setopt(pycurl.MAXREDIRS, self._max_redirects)

    # --- timeouts -------------------------------------------------------
    if self._timeout is not None:
        logger.debug("Set timeout: %s" % self._timeout)
        opener.setopt(pycurl.TIMEOUT, self._timeout)
    if self._connection_timeout is not None:
        # FIX: log the connection timeout, not the total timeout.
        logger.debug("Set connect timeout: %s" % self._connection_timeout)
        opener.setopt(pycurl.CONNECTTIMEOUT, self._connection_timeout)

    # --- debug output ---------------------------------------------------
    if isinstance(self._debug_curl, FunctionType):
        logger.debug("Setup %s as debug function" % self._debug_curl.__name__)
        opener.setopt(pycurl.VERBOSE, 1)
        opener.setopt(pycurl.DEBUGFUNCTION, self._debug_curl)
    elif self._debug_curl is True:
        opener.setopt(pycurl.VERBOSE, 1)
        opener.setopt(pycurl.DEBUGFUNCTION, logger_debug)
    else:
        opener.setopt(pycurl.VERBOSE, 0)

    # Advertise gzip/deflate support to the server.
    if self._use_gzip is not None:
        logger.debug("Use gzip")
        opener.setopt(pycurl.ENCODING, "gzip,deflate")

    # Specify network interface (ip address) for the query.
    if self._network_interface is not None:
        logger.debug("Use custom network interface %s" % self._network_interface)
        opener.setopt(pycurl.INTERFACE, self._network_interface)

    # --- proxy ----------------------------------------------------------
    if self._proxy is not None:
        logger.debug("Use proxies %r" % (self._proxy, ))
        if len(self._proxy) > 2:
            proxy_type, proxy_addr, proxy_auth = self._proxy
            logger.debug("Use proxy with auth %s, %s, %s" %
                         (proxy_type, proxy_addr, proxy_auth))
        else:
            proxy_type, proxy_addr = self._proxy
            proxy_auth = None
            logger.debug("Use proxy withouth auth %s, %s" %
                         (proxy_type, proxy_addr))
        opener.setopt(pycurl.PROXY, proxy_addr[0])
        opener.setopt(pycurl.PROXYPORT, proxy_addr[1])
        opener.setopt(pycurl.PROXYTYPE, get_code_by_name(proxy_type))
        if proxy_type.upper() in ("CONNECT", "SSL", "HTTPS"):
            # CONNECT-style proxies need HTTPPROXYTUNNEL.
            opener.setopt(pycurl.HTTPPROXYTUNNEL, 1)
        if proxy_auth:
            if len(proxy_auth) == 2:
                opener.setopt(pycurl.PROXYUSERPWD, "%s:%s" % proxy_auth)
                opener.setopt(pycurl.PROXYAUTH, pycurl.HTTPAUTH_ANY)
                logger.debug("set PROXYAUTH = HTTPAUTH_ANY")
                logger.debug("set auth %s, %s" % proxy_auth)
            else:
                raise InterfaceError("Proxy auth data must be tuple")

    logger.debug("Setup user agent %s" % self.user_agent)
    opener.setopt(pycurl.USERAGENT, self.user_agent)

    # --- TLS ------------------------------------------------------------
    if self._validate_cert not in (None, False):
        logger.debug("Validate certificate")
        # Verify that we've got the right site; harmless on a non-SSL connect.
        opener.setopt(pycurl.SSL_VERIFYPEER, 1)
        opener.setopt(pycurl.SSL_VERIFYHOST, 2)
    else:
        opener.setopt(pycurl.SSL_VERIFYPEER, 0)
        opener.setopt(pycurl.SSL_VERIFYHOST, 0)
    if self._ca_certs is not None:
        logger.debug("Use ca cert %s" % self._ca_certs)
        if file_exists(self._ca_certs):
            opener.setopt(pycurl.CAINFO, self._ca_certs)
            logger.debug("open ca cert file %s" % self._ca_certs)
    ## (HTTPS) Tells curl to use the specified certificate file when getting a
    ## file with HTTPS. The certificate must be in PEM format.
    ## If the optional password isn't specified, it will be queried for on the terminal.
    ## Note that this certificate is the private key and the private certificate concatenated!
    ## If this option is used several times, the last one will be used.
    if self._cert:
        logger.debug("Use cert %s" % self._cert)
        opener.setopt(pycurl.SSLCERT, self._cert)

    if self._ip_v6:
        logger.debug("ipresolve ip_v6")
        opener.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_WHATEVER)
    else:
        logger.debug("ipresolve ip_v4")
        opener.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)

    # opener.setopt(c.NOPROGRESS, 0)
    # opener.setopt(c.PROGRESSFUNCTION, self._progress_callback)

    # --- cookies --------------------------------------------------------
    if self._cookies is not None:
        chunks = []
        for cookie in self._cookies:
            name, value = cookie.name, cookie.value
            name = quote_plus(name)
            value = quote_plus(value)
            chunks.append('%s=%s;' % (name, value))
        if chunks:
            opener.setopt(pycurl.COOKIE, ''.join(chunks))
    else:
        # set empty cookie to activate cURL cookies
        logger.debug("set empty cookie to activate cURL cookies")
        opener.setopt(pycurl.COOKIELIST, '')

    # --- method ---------------------------------------------------------
    curl_options = {
        "GET": pycurl.HTTPGET,
        "POST": pycurl.POST,
        # "PUT": pycurl.UPLOAD,
        "PUT": pycurl.PUT,
        "HEAD": pycurl.NOBODY}
    logger.debug("Use method %s for request" % self._method)
    # BUG FIX: the original tested `self._method in list(curl_options.values())`
    # — the values are pycurl integer constants, so a method name like "GET"
    # never matched and every standard method fell through to CUSTOMREQUEST.
    # Membership must be tested against the keys.
    if self._method in curl_options:
        opener.setopt(curl_options[self._method], True)
    elif self._method in self.SUPPORTED_METHODS:
        opener.setopt(pycurl.CUSTOMREQUEST, self._method)
    else:
        raise InvalidMethod("cURL request do not support %s" % self._method)

    # Responses without body
    if self._method in ("OPTIONS", "HEAD", "DELETE"):
        opener.setopt(pycurl.NOBODY, True)

    # --- request body ---------------------------------------------------
    if self._method in ("POST", "PUT"):
        if self._files is not None:
            post_params = self._files
            if isinstance(self._data, (tuple, list, dict)):
                post_params.extend(data_wrapper(self._data))
            opener.setopt(opener.HTTPPOST, post_params)
        else:
            if isinstance(self._data, str):
                logger.debug(("self._data is string"))
                logger.debug(("self._data", self._data))
                # raw data for body request
                request_buffer = BytesIO(self._data)
                opener.setopt(pycurl.READFUNCTION, request_buffer.read)

                def ioctl(cmd):
                    # Rewind the body when libcurl restarts the upload.
                    logger.debug(("cmd", cmd))
                    if cmd == pycurl.IOCMD_RESTARTREAD:
                        request_buffer.seek(0)

                opener.setopt(pycurl.IOCTLFUNCTION, ioctl)
                if self._method == "PUT":
                    opener.setopt(pycurl.PUT, True)
                    opener.setopt(pycurl.INFILESIZE, len(self._data))
                else:
                    opener.setopt(pycurl.POST, True)
                    opener.setopt(pycurl.POSTFIELDSIZE, len(self._data))
            elif isinstance(self._data, (tuple, list, dict)):
                headers = dict(self._headers or [])
                if 'multipart' in headers.get('Content-Type', ''):
                    # use multipart/form-data;
                    opener.setopt(opener.HTTPPOST, data_wrapper(self._data))
                else:
                    # use postfields to send vars as application/x-www-form-urlencoded
                    encoded_data = urlencode(self._data, doseq=True)
                    opener.setopt(pycurl.POSTFIELDS, encoded_data)

    # --- raw curl option overrides --------------------------------------
    if isinstance(self._options, (tuple, list)):
        for key, value in self._options:
            logger.debug("set option %s=%s" % (key, value))
            opener.setopt(key, value)

    # Wire header/body capture buffers and store the configured handle.
    self.body_output = BytesIO()
    self.headers_output = BytesIO()
    self.setup_writers(opener, self.headers_output.write, self.body_output.write)
    self._opener = opener
    return opener