Esempio n. 1
0
    def get_opener():
        """Create a fresh `pycurl.Curl` handle in its "clean" state.

        The handle has ``NOSIGNAL`` enabled and carries two bookkeeping
        attributes: ``fp`` (set to ``None``) and ``dirty`` (set to ``False``).

        :return: the newly created :class:`pycurl.Curl` object
        """
        curl = pycurl.Curl()
        curl.setopt(pycurl.NOSIGNAL, 1)
        # Bookkeeping attributes attached to the handle itself.
        curl.fp, curl.dirty = None, False
        return curl
Esempio n. 2
0
    def test_get(self):
        """Perform a GET and check the status code and the echoed request.

        The response body is parsed as JSON; its ``url`` entry must equal the
        requested URL and its ``User-Agent`` header entry must be the default
        ``PycURL/<version>`` string.
        """
        url = build_url('get')
        body_output = StringIO()
        headers_output = StringIO()

        c = pycurl.Curl()
        try:
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.WRITEFUNCTION, body_output.write)
            c.setopt(pycurl.HEADERFUNCTION, headers_output.write)

            c.perform()
            # getinfo() must be called before the handle is closed.
            self.assertEqual(200, c.getinfo(pycurl.RESPONSE_CODE))
        finally:
            # Release the handle even when perform() or the assertion fails.
            c.close()

        json_body = json.loads(body_output.getvalue())
        self.assertEqual(json_body['url'], url)
        self.assertEqual(json_body['headers']['User-Agent'], "PycURL/%s" % pycurl.version_info()[1])
def get_ex_sentence(w):
    """Download weblio example-sentence pages for the term in ``w[0]``.

    Each page is fetched into ``test.txt`` and ``read_url(w)`` is called
    afterwards (presumably it parses that file -- confirm against
    ``read_url``).  Paging stops at the first exit condition that holds:

    * ``read_url`` returned ``'1'``: give up;
    * ``read_url`` returned ``'0'``: succeed only if ``snt_total(w) > 10``;
    * ``snt_total(w) > 40``: succeed;
    * four pages have been fetched: succeed.

    :param w: sequence whose first item is the (possibly multi-word) term
    :return: ``1`` on success, ``None`` when not enough sentences were found
    """
    # Create (or truncate) the per-word output file.
    f1 = codecs.open('%s.txt' % u'_'.join(w[0].split(' ')), 'wb', 'utf-8')
    f1.close()

    count = 1
    while True:
        query = '+'.join(w[0].split(' '))
        if count == 1:
            url = 'http://ejje.weblio.jp/sentence/content/%s' % query
        else:
            url = 'http://ejje.weblio.jp/sentence/content/%s/%d' % (query, count)
        print(url)
        count += 1

        c = pycurl2.Curl()
        try:
            c.setopt(c.URL, str(url))
            with open('test.txt', 'wb') as f:
                c.setopt(c.WRITEDATA, f)
                c.perform()
        finally:
            # A new handle is made for every page, so release each one.
            c.close()

        sentences = read_url(w)
        # NOTE(review): the original also had a first branch
        # ``sentences == '0' and count == 0`` that printed
        # "Can't find any sentence".  ``count`` starts at 1 and only grows,
        # so that branch could never run; it is omitted here and the
        # observable behaviour is unchanged.
        if sentences == '1':
            print('Can\'t find enough sentences\n')
            return None
        if sentences == '0':
            if snt_total(w) > 10:
                return 1
            print('Can\'t find enough sentences\n')
            return None
        # ``count`` was already advanced, so ``count >= 5`` means that
        # four pages have been fetched.
        if snt_total(w) > 40 or count >= 5:
            return 1
Esempio n. 4
0
    def test_head(self):
        """Perform a HEAD request and check status, URL, body and status line.

        ``NOBODY`` is enabled, so the body buffer must stay empty while the
        first collected header line must be ``HTTP/1.1 200 OK``.
        """
        url = build_url('head')
        body_output = StringIO()
        headers_output = StringIO()

        c = pycurl.Curl()
        try:
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.NOBODY, True)
            c.setopt(pycurl.WRITEFUNCTION, body_output.write)
            c.setopt(pycurl.HEADERFUNCTION, headers_output.write)

            c.perform()

            # getinfo() must be called before the handle is closed.
            self.assertEqual(200, c.getinfo(pycurl.RESPONSE_CODE))
            self.assertEqual(url, c.getinfo(pycurl.EFFECTIVE_URL))
        finally:
            # Release the handle even when perform() or an assertion fails.
            c.close()

        self.assertEqual(len(body_output.getvalue()), 0)
        headers_list = headers_output.getvalue().split("\r\n")
        self.assertEqual(headers_list[0], "HTTP/1.1 200 OK")
Esempio n. 5
0
    def make_pycurl(cls, method, url, headers = None, data = None):
        """Build a configured ``pycurl.Curl`` handle for *method* and *url*.

        :param method: HTTP method name such as ``"GET"`` or ``"POST"``
        :param url: URL to request
        :param headers: optional dict of request headers; names are
            re-capitalised with ``capwords(name, "-")``
        :param data: optional raw request body, only used for ``POST``/``PUT``
            (``None`` is sent as an empty body)
        :return: ``(curl, header_output, body_output)`` where the two buffers
            receive the response headers and the response body
        """
        header_output = BytesIO()
        body_output = BytesIO()

        c = pycurl.Curl()
        c.setopt(pycurl.URL, url)
        c.setopt(pycurl.NOSIGNAL, 1)

        if isinstance(headers, dict):
            c.setopt(pycurl.HTTPHEADER, [
                "%s: %s" % (capwords(f, "-"), v)
                for f, v in CaseInsensitiveDict(headers).items()
            ])

        c.setopt(pycurl.CONNECTTIMEOUT, 3)
        c.setopt(pycurl.TIMEOUT, 3)

        if method in cls._curl_options:
            c.setopt(cls._curl_options[method], True)
        elif method in cls.SUPPORTED_METHODS:
            c.setopt(pycurl.CUSTOMREQUEST, method)

        if method in ("POST", "PUT"):
            if data is None:
                data = b""

            body_inout = BytesIO(data)
            c.setopt(pycurl.READFUNCTION, body_inout.read)

            def ioctl(cmd):
                # Rewind the request body when libcurl asks to restart reading.
                if cmd == pycurl.IOCMD_RESTARTREAD:
                    body_inout.seek(0)

            c.setopt(pycurl.IOCTLFUNCTION, ioctl)
            if method == "PUT":
                c.setopt(pycurl.PUT, True)
                c.setopt(pycurl.INFILESIZE, len(data))
            else:
                c.setopt(pycurl.POST, True)
                c.setopt(pycurl.POSTFIELDSIZE, len(data))

        # Install the writers for every method so the returned buffers are
        # always filled.  The body goes through WRITEFUNCTION; the original
        # set HEADERFUNCTION twice, which discarded the header writer and
        # never captured the body.
        c.setopt(pycurl.HEADERFUNCTION, header_output.write)
        c.setopt(pycurl.WRITEFUNCTION, body_output.write)

        return (c, header_output, body_output)
Esempio n. 6
0
def multi_get(wf,
              urls,
              debug=0,
              num_conn=100,
              timeout=5,
              ua=None,
              ref=None,
              percentile=100,
              cf=None,
              follow=1,
              ref_dict=None):
    """Fetch many URLs concurrently and store each result in *wf*.

    For every fetched URL ``wf[url]`` is set to the response body.  It is set
    to ``''`` when the body is longer than 100000 characters, and to ``'---'``
    when the transfer failed, the URL is longer than 250 characters, or the
    URL contains ``'.pdf'`` (such URLs are never requested).  Empty URLs,
    URLs starting with ``#`` and URLs already present in *wf* are skipped.

    :param wf: mapping that receives the results; must support ``keys()`` and
        item assignment
    :param urls: iterable of URL strings (duplicates are dropped)
    :param debug: when true, print progress information
    :param num_conn: maximum number of concurrent connections
    :param timeout: value used for both ``CONNECTTIMEOUT`` and ``TIMEOUT``
    :param ua: User-Agent string; defaults to ``'multi_get'``
    :param ref: optional Referer applied to every handle
    :param percentile: stop early once the processed share of URLs, in
        percent, exceeds this value
    :param cf: optional path used as both ``COOKIEFILE`` and ``COOKIEJAR``
    :param follow: when true, follow up to 5 redirects
    :param ref_dict: optional mapping of URL to a per-URL Referer
    :return: ``None``
    """
    from collections import deque

    if ua is None:
        ua = 'multi_get'
    queue = deque()

    wf_keys = set(wf.keys())

    for url in dict.fromkeys(urls).keys():
        url = url.strip()
        if len(url) > 250:
            wf[url] = '---'
            continue
        if not url or url[0] == "#" or url in wf_keys:
            continue
        filename = "[%03d]" % (len(queue) + 1)
        queue.append((url, filename))

    if not queue:
        return

    num_urls = len(queue)
    num_conn = min(num_conn, num_urls)
    assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
    if debug:
        print("PycURL %s (compiled against 0x%x)" % (
            pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM))
        print("----- Getting %s URLs using %s connections -----" % (
            num_urls, num_conn))

    m = pycurl.CurlMulti()
    m.handles = []
    try:
        # Pre-allocate a pool of easy handles that is reused for all URLs.
        for _ in range(num_conn):
            c = pycurl.Curl()
            # Register first so the handle is closed even if setopt() fails.
            m.handles.append(c)
            c.fp = None
            if follow:
                c.setopt(pycurl.FOLLOWLOCATION, 1)
                c.setopt(pycurl.MAXREDIRS, 5)
            c.setopt(pycurl.CONNECTTIMEOUT, timeout)
            c.setopt(pycurl.TIMEOUT, timeout)
            c.setopt(pycurl.NOSIGNAL, 1)
            c.setopt(pycurl.USERAGENT, ua)
            if cf:
                c.setopt(pycurl.COOKIEFILE, cf)
                c.setopt(pycurl.COOKIEJAR, cf)
            if ref:
                c.setopt(pycurl.REFERER, ref)

        freelist = m.handles[:]
        num_processed = 0
        bailout = 0
        while num_processed < num_urls:
            if bailout:
                break
            # Hand queued URLs to idle handles.
            while queue and freelist:
                url, filename = queue.popleft()
                if '.pdf' not in url:
                    c = freelist.pop()
                    if isinstance(url, type(u'')):
                        url = url.encode('utf8', 'replace')
                    c.setopt(pycurl.URL, url)
                    c.res = cStringIO.StringIO()
                    c.setopt(pycurl.WRITEFUNCTION, c.res.write)
                    # NOTE(review): a per-URL referer set here stays on the
                    # reused handle for later URLs that have no ref_dict
                    # entry -- confirm whether that is intended.
                    if ref_dict is not None:
                        if ref_dict.get(url, ''):
                            c.setopt(pycurl.REFERER, ref_dict.get(url, ''))

                    m.add_handle(c)
                    c.filename = filename
                    c.url = url
                else:
                    # PDFs are never fetched; they no longer count as work.
                    wf[url] = '---'
                    num_urls -= 1
            # Drive the transfers until libcurl has nothing more to do now.
            while 1:
                ret, num_handles = m.perform()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break
            # Collect finished transfers and recycle their handles.
            while 1:
                num_q, ok_list, err_list = m.info_read()
                for c in ok_list:
                    c.fp = None
                    m.remove_handle(c)

                    text = c.res.getvalue()
                    if len(text) > 100000:
                        text = ''

                    wf[c.url] = text

                    if debug:
                        try:
                            print("[ ok] %5s %40s" % (c.filename, c.url[:40]))
                        except (UnicodeError, IOError, OSError):
                            # Debug output is best-effort only.
                            pass

                    freelist.append(c)
                for c, errno, errmsg in err_list:
                    c.fp = None
                    m.remove_handle(c)
                    if debug:
                        print("[err] %5s %40s" % (c.filename, c.url[:40]))
                    wf[c.url] = '---'
                    freelist.append(c)
                num_processed = num_processed + len(ok_list) + len(err_list)
                if num_urls:
                    if float(num_processed) / num_urls * 100 > percentile:
                        bailout = 1
                        break
                if num_q == 0:
                    break
            # Wait for socket activity before the next round.
            m.select(1.0)
    finally:
        # Closing the multi object does not close the easy handles, so
        # release every handle explicitly, including ones still in flight
        # after an early bailout or an exception.
        for c in m.handles:
            c.close()
        m.close()
Esempio n. 7
0
from general import save_page, curl_init, crawlerlog
import pycurl2 as pycurl
import StringIO
import datetime as dt
import time
import urllib

# Script entry point: prepares a curl handle for a form-based login POST.
# NOTE(review): the visible code only configures the handle; no perform()
# call appears here, so the example is presumably cut off -- confirm.
if __name__ == '__main__':
    login_url = 'https://www.my089.com/login.aspx'
    # Not used in the visible part of the script.
    detail_url = 'http://www.my089.com/Loan/Detail.aspx?sid=13090511403742990287210017241327'
    pycurl.global_init(pycurl.GLOBAL_ALL)
    curl = pycurl.Curl()
    # Apply the project's common handle setup (helper imported from `general`).
    curl_init(curl)
    # Set the target URL.
    curl.setopt(pycurl.URL, login_url)
    curl.setopt(pycurl.COOKIEFILE, 'cookie_file')
    # Form fields for the login POST.
    # NOTE(review): the password and its hash are hard-coded here and are
    # echoed to stdout by the print below -- consider loading them from the
    # environment and dropping the print.
    post_param = {
        '__VIEWSTATE':
        '/wEPDwUKMTc3Mzg3OTU5Mw9kFgJmD2QWAmYPZBYCAgQPZBYCAgEPZBYCAgEPZBYEAgUPFgIeB1Zpc2libGVoFgICAQ8PFggeBElzSFpoHgNTSUQFJFNJRF84MDQ4NGFjNGQyZTc0NDUwYTM5OTgwMjdhNTEyYjNiNR4HVmVyc2lvbgUBTx8AaGQWAgICDxYEHgdvbmNsaWNrBXNXZWJGb3JtX0RvQ2FsbGJhY2soJ2N0bDAwJGN0bDAwJENvbnRlbnRQbGFjZUhvbGRlcjEkQ29udGVudFBsYWNlSG9sZGVyMSRSYW5kb21Db2RlMScsIiIsR2V0Q2hlY2tDb2RlLCIiLG51bGwsZmFsc2UpHgNzcmMFfy4uL2NvbW1vblBhZ2UvcnZjLmFzcHg/c2lkPUE1RThCNjdFQjM1RTM3NERCNUUyMDlDMzg2MjQ1MTM3NUVCRTA2ODIzOTJBMkRERUVCRThEN0JBRDMzOTZFNjYxNzdGNjVGNTNEMDg0RUE4JnI9MDcyMTIwMzUxJnY9TyZpPUZkAgcPDxYCHg1PbkNsaWVudENsaWNrBb8DaWYoIVZhbGlkYXRlTG9naW4oJ2N0bDAwX2N0bDAwX0NvbnRlbnRQbGFjZUhvbGRlcjFfQ29udGVudFBsYWNlSG9sZGVyMV90eHRVc2VyTmFtZScsJ2N0bDAwX2N0bDAwX0NvbnRlbnRQbGFjZUhvbGRlcjFfQ29udGVudFBsYWNlSG9sZGVyMV90eHRQd2QnKSkgcmV0dXJuIGZhbHNlO0VuY3J5cHRUZXh0KCdjdGwwMF9jdGwwMF9Db250ZW50UGxhY2VIb2xkZXIxX0NvbnRlbnRQbGFjZUhvbGRlcjFfdHh0UHdkJyk7IHZhciBoPWRvY3VtZW50LmdldEVsZW1lbnRCeUlkKCdjdGwwMF9jdGwwMF9Db250ZW50UGxhY2VIb2xkZXIxX0NvbnRlbnRQbGFjZUhvbGRlcjFfaGZwd2QnKTsgdmFyIHA9ZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQoJ2N0bDAwX2N0bDAwX0NvbnRlbnRQbGFjZUhvbGRlcjFfQ29udGVudFBsYWNlSG9sZGVyMV90eHRQd2QnKTsgaC52YWx1ZT1wLnZhbHVlO3AudmFsdWU9Jyc7ZGRk/Tr0LeEraW4ekqOQICduvYfIfZ8=',
        '__EVENTVALIDATION':
        '/wEWEALCoejmBgLVz5eVAwLFqsi1BQKC+8rVCAKJ9JOUBwKF9KOUBwKH9J+UBwKA9J+UBwKjtdhDAq21gEACqbXwQwKfyMGlAwLWm4jIDwKptbD+CwKFj72BCwK/w5iEDMTYGemgoCSMD87cPvxc/++ORM6h',
        'ctl00$ctl00$ContentPlaceHolder1$ContentPlaceHolder1$txtUserName':
        '******',
        'ctl00$ctl00$ContentPlaceHolder1$ContentPlaceHolder1$txtPwd':
        '7uko098I3',
        'ctl00$ctl00$ContentPlaceHolder1$ContentPlaceHolder1$hfpwd':
        '7ecda547d0b615a0b01f7ff4747cf024'
    }
    # Show the url-encoded form body, then install it as the POST payload.
    print urllib.urlencode(post_param)
    curl.setopt(pycurl.POSTFIELDS, urllib.urlencode(post_param))
Esempio n. 8
0
    def build_opener(self, url, opener=None):
        """Configure a `pycurl.Curl` instance from this request's settings.

        Applies authentication, headers, redirects, timeouts, debugging,
        compression, network interface, proxy, user agent, certificate
        handling, IP resolution, cookies, HTTP method, request body and any
        extra options to the handle.  Fresh ``BytesIO`` buffers are stored in
        ``self.body_output`` and ``self.headers_output`` and wired up through
        ``self.setup_writers``; the handle is also stored in ``self._opener``.

        :param url: resource url
        :param opener: optional existing `pycurl.Curl` to reuse; a new one is
            created when omitted.  A handle without a false ``dirty``
            attribute is first passed through ``self.clean_opener``.
        :return: the configured `pycurl.Curl` instance
        :raises InterfaceError: when proxy auth data does not have 2 items
        :raises InvalidMethod: when the HTTP method is not supported
        """
        # http://curl.haxx.se/mail/curlpython-2005-06/0004.html
        # http://curl.haxx.se/mail/lib-2010-03/0114.html

        opener = opener or pycurl.Curl()

        if getattr(opener, "dirty", True):
            opener = self.clean_opener(opener)

        logger.debug("Open url: %s" % url)
        opener.setopt(pycurl.URL, url)
        opener.setopt(pycurl.NOSIGNAL, 1)

        if isinstance(self._auth, AuthManager):
            self._auth.setup_request(self)
            self._auth.setup(opener)
        elif self._netrc:
            self.setup_netrc(opener)
        else:
            opener.unsetopt(pycurl.USERPWD)

        if self._headers:
            # Normalise once; reused for the debug dump and for the option.
            header_items = list(CaseInsensitiveDict(self._headers).items())
            logger.debug("Setup custom headers %s" % "\r\n".join(
                ["%s: %s" % (f, v) for f, v in header_items]))
            opener.setopt(pycurl.HTTPHEADER, [
                "%s: %s" % (capwords(f, "-"), v) for f, v in header_items
            ])

        # Option -L  Follow  "Location: "  hints
        if self._allow_redirects is True:
            logger.debug("Allow redirects")
            opener.setopt(pycurl.FOLLOWLOCATION, self._allow_redirects)
            if self._max_redirects:
                opener.setopt(pycurl.MAXREDIRS, self._max_redirects)

        # Set timeout for a retrieving an object
        if self._timeout is not None:
            logger.debug("Set timeout: %s" % self._timeout)
            opener.setopt(pycurl.TIMEOUT, self._timeout)
        if self._connection_timeout is not None:
            # Log the value that is actually applied (the original logged
            # self._timeout here).
            logger.debug("Set connect timeout: %s" % self._connection_timeout)
            opener.setopt(pycurl.CONNECTTIMEOUT, self._connection_timeout)

        # Setup debug output write function
        if isinstance(self._debug_curl, FunctionType):
            logger.debug("Setup %s as debug function" %
                         self._debug_curl.__name__)
            opener.setopt(pycurl.VERBOSE, 1)
            opener.setopt(pycurl.DEBUGFUNCTION, self._debug_curl)
        elif self._debug_curl is True:
            opener.setopt(pycurl.VERBOSE, 1)
            opener.setopt(pycurl.DEBUGFUNCTION, logger_debug)
        else:
            opener.setopt(pycurl.VERBOSE, 0)

        # Send allow gzip encoding header
        # NOTE(review): any non-None value, including False, enables
        # compression here -- confirm that this is intended.
        if self._use_gzip is not None:
            logger.debug("Use gzip")
            opener.setopt(pycurl.ENCODING, "gzip,deflate")

        # Specify network interface (ip address) for query
        if self._network_interface is not None:
            logger.debug("Use custom network interface %s" %
                         self._network_interface)
            opener.setopt(pycurl.INTERFACE, self._network_interface)

        # Setup proxy for request
        if self._proxy is not None:
            logger.debug("Use proxies %r" % (self._proxy, ))
            if len(self._proxy) > 2:
                proxy_type, proxy_addr, proxy_auth = self._proxy
                logger.debug("Use proxy with auth %s, %s, %s" %
                             (proxy_type, proxy_addr, proxy_auth))
            else:
                proxy_type, proxy_addr = self._proxy
                proxy_auth = None
                logger.debug("Use proxy withouth auth %s, %s" %
                             (proxy_type, proxy_addr))

            opener.setopt(pycurl.PROXY, proxy_addr[0])
            opener.setopt(pycurl.PROXYPORT, proxy_addr[1])
            opener.setopt(pycurl.PROXYTYPE, get_code_by_name(proxy_type))

            if proxy_type.upper() in ("CONNECT", "SSL", "HTTPS"):
                # A CONNECT proxy needs HTTPPROXYTUNNEL.
                opener.setopt(pycurl.HTTPPROXYTUNNEL, 1)
            if proxy_auth:
                if len(proxy_auth) == 2:
                    opener.setopt(pycurl.PROXYUSERPWD, "%s:%s" % proxy_auth)
                    opener.setopt(pycurl.PROXYAUTH, pycurl.HTTPAUTH_ANY)
                    logger.debug("set PROXYAUTH = HTTPAUTH_ANY")
                    logger.debug("set auth %s, %s" % proxy_auth)
                else:
                    raise InterfaceError("Proxy auth data must be tuple")

        logger.debug("Setup user agent %s" % self.user_agent)
        opener.setopt(pycurl.USERAGENT, self.user_agent)

        if self._validate_cert not in (None, False):
            logger.debug("Validate certificate")
            # Verify that we've got the right site; harmless on a non-SSL connect.
            opener.setopt(pycurl.SSL_VERIFYPEER, 1)
            opener.setopt(pycurl.SSL_VERIFYHOST, 2)
        else:
            opener.setopt(pycurl.SSL_VERIFYPEER, 0)
            opener.setopt(pycurl.SSL_VERIFYHOST, 0)

        if self._ca_certs is not None:
            logger.debug("Use ca cert %s" % self._ca_certs)
            if file_exists(self._ca_certs):
                opener.setopt(pycurl.CAINFO, self._ca_certs)
                logger.debug("open ca cert file %s" % self._ca_certs)

        ## (HTTPS) Tells curl to use the specified certificate file when getting a
        ## file with HTTPS. The certificate must be in PEM format.
        ## If the optional password isn't specified, it will be queried for on the terminal.
        ## Note that this certificate is the private key and the private certificate concatenated!
        ## If this option is used several times, the last one will be used.
        if self._cert:
            logger.debug("Use cert %s" % self._cert)
            opener.setopt(pycurl.SSLCERT, self._cert)

        if self._ip_v6:
            logger.debug("ipresolve ip_v6")
            opener.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_WHATEVER)
        else:
            logger.debug("ipresolve ip_v4")
            opener.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)

        # opener.setopt(c.NOPROGRESS, 0)
        # opener.setopt(c.PROGRESSFUNCTION, self._progress_callback)

        # Add cookies from self._cookies
        if self._cookies is not None:
            chunks = []
            for cookie in self._cookies:
                name, value = cookie.name, cookie.value
                ## if isinstance(name, unicode):
                ##     name = name.encode("utf-8")
                ## if isinstance(value, unicode):
                ##     value = value.encode("utf-8")
                name = quote_plus(name)
                value = quote_plus(value)
                chunks.append('%s=%s;' % (name, value))
            if chunks:
                opener.setopt(pycurl.COOKIE, ''.join(chunks))
        else:
            # set empty cookie to activate cURL cookies
            logger.debug("set empty cookie to activate cURL cookies")
            opener.setopt(pycurl.COOKIELIST, '')

        curl_options = {
            "GET": pycurl.HTTPGET,
            "POST": pycurl.POST,
            # "PUT": pycurl.UPLOAD,
            "PUT": pycurl.PUT,
            "HEAD": pycurl.NOBODY
        }

        logger.debug("Use method %s for request" % self._method)
        # NOTE(review): this tests the method name against the option
        # *values* (pycurl constants), not the dict keys, so for string
        # method names the first branch does not run and every supported
        # method goes through CUSTOMREQUEST.  Left unchanged because
        # switching to the dedicated options would alter how PUT combined
        # with POSTFIELDS is sent -- confirm the intent before changing.
        if self._method in list(curl_options.values()):
            opener.setopt(curl_options[self._method], True)
        elif self._method in self.SUPPORTED_METHODS:
            opener.setopt(pycurl.CUSTOMREQUEST, self._method)
        else:
            raise InvalidMethod("cURL request do not support %s" %
                                self._method)

        # Responses without body
        if self._method in ("OPTIONS", "HEAD", "DELETE"):
            opener.setopt(pycurl.NOBODY, True)

        if self._method in ("POST", "PUT"):
            if self._files is not None:
                # Work on a copy so that appending the form fields does not
                # grow self._files each time the opener is rebuilt.
                post_params = list(self._files)
                if isinstance(self._data, (tuple, list, dict)):
                    post_params.extend(data_wrapper(self._data))
                opener.setopt(opener.HTTPPOST, post_params)
            else:
                if isinstance(self._data, str):
                    logger.debug(("self._data is string"))
                    logger.debug(("self._data", self._data))
                    request_buffer = BytesIO(self._data)

                    # raw data for body request
                    opener.setopt(pycurl.READFUNCTION, request_buffer.read)

                    def ioctl(cmd):
                        # Rewind the body when libcurl asks to restart reading.
                        logger.debug(("cmd", cmd))
                        if cmd == pycurl.IOCMD_RESTARTREAD:
                            request_buffer.seek(0)

                    opener.setopt(pycurl.IOCTLFUNCTION, ioctl)
                    if self._method == "PUT":
                        opener.setopt(pycurl.PUT, True)
                        opener.setopt(pycurl.INFILESIZE, len(self._data))
                    else:
                        opener.setopt(pycurl.POST, True)
                        opener.setopt(pycurl.POSTFIELDSIZE, len(self._data))
                elif isinstance(self._data, (tuple, list, dict)):
                    headers = dict(self._headers or [])
                    if 'multipart' in headers.get('Content-Type', ''):
                        # use multipart/form-data;
                        opener.setopt(opener.HTTPPOST,
                                      data_wrapper(self._data))
                    else:
                        # use postfields to send vars as application/x-www-form-urlencoded
                        encoded_data = urlencode(self._data, doseq=True)
                        opener.setopt(pycurl.POSTFIELDS, encoded_data)

        # Caller-supplied raw options are applied last so they can override
        # anything configured above.
        if isinstance(self._options, (tuple, list)):
            for key, value in self._options:
                logger.debug("set option %s=%s" % (key, value))
                opener.setopt(key, value)

        self.body_output = BytesIO()
        self.headers_output = BytesIO()

        self.setup_writers(opener, self.headers_output.write,
                           self.body_output.write)

        self._opener = opener

        return opener