Example #1
def get(url, cookies='', proxy=None, useragent='', referer='',
        headers=[], redirect_count=0, headers_only=False):
    """
    GET request
    Return values: see the __request function
    """
    err_counter = 0

    if redirect_count >= MAX_REDIRECTS:
        raise InfiniteRedirection(url)

    # Infinite loop for hammer mode
    while True:

        try:
            result = __process_redirect(__request(url, request_type='get',
                                                  cookies=cookies,
                                                  proxy=proxy,
                                                  referer=referer,
                                                  useragent=useragent,
                                                  headers=headers,
                                                  redirect_count=redirect_count,
                                                  headers_only=headers_only))
            return result

        except DeadProxy as e:
            err_counter += 1
            if err_counter >= HAMMER_MODE_ATTEMPTS:
                raise DeadProxy(e['proxy'], e['port'])

        except pycurl.error as e:
            err_counter += 1
            if err_counter >= HAMMER_MODE_ATTEMPTS:
                raise pycurl.error(str(e))
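A hedged usage sketch for this hammer-mode helper follows; the URL and proxy are placeholders, and the result keys shown are the ones documented by __request further down (Example #19):

# Hypothetical call; retries happen inside get() until HAMMER_MODE_ATTEMPTS is hit.
try:
    result = get("http://example.com/",
                 proxy="127.0.0.1:9050",      # "ip:port", split apart inside __request
                 useragent="Mozilla/5.0")
    print(result["response_code"], result["current_url"])
except DeadProxy:
    pass  # the proxy failed HAMMER_MODE_ATTEMPTS times in a row
except pycurl.error:
    pass  # a non-proxy curl error persisted across all attempts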
Example #2
def post(url, data, cookies='', proxy=None, useragent='', referer='',
         headers=[], headers_only=False):
    """
    POST request
    data: dict
    Return values: see the __request function
    """
    err_counter = 0

    # Infinite loop for hammer mode
    while True:
        try:
            result = __process_redirect(__request(url, request_type='post',
                                                  cookies=cookies,
                                                  referer=referer,
                                                  post_data=data,
                                                  useragent=useragent,
                                                  headers=headers,
                                                  proxy=proxy,
                                                  headers_only=headers_only))
            return result
        except DeadProxy as e:
            err_counter += 1
            if err_counter >= HAMMER_MODE_ATTEMPTS:
                raise DeadProxy(e['proxy'], e['port'])

        except pycurl.error as e:
            err_counter += 1
            if err_counter >= HAMMER_MODE_ATTEMPTS:
                raise pycurl.error(str(e))
Example #3
    def downloadWorker(self):
        logger.info('Starting download worker (retries=%d)' % (self.retrylimit - 1))
        while True:
            item = self.q.get()
            try:
                debug.log('Downloading %s to %s%s (retries=%d)' % (item['remoteurl'], item['downloaddir'], item['localfname'], self.retrylimit-1))
                fp = open(item['downloaddir'] + item['localfname'], 'wb')
                c = pycurl.Curl()
                c.setopt(c.URL, item['remoteurl'])
                c.setopt(c.WRITEDATA, fp)
                c.perform()
                if c.getinfo(pycurl.HTTP_CODE) != 200:
                    logger.error("FAILED to download %s: %d" % (item['remoteurl'], c.getinfo(pycurl.HTTP_CODE)))
                    raise pycurl.error()
                c.close()
                fp.close()
                self.downloadedsegs.append((item['order'], item['localfname']))
            except pycurl.error:
                logger.error('Caught exception while downloading %s' % item['remoteurl'])
                c.close()
                item['retries'] += 1
                if (item['retries'] < self.retrylimit):
                    logger.info('Retry counter is %d, will try again' % item['retries'])
                    self.q.put(item)
                else:
                    logger.error('Retry counter exceeded for %s' % item['localfname'])
                    self.failedDownloads = True

            finally:
                self.q.task_done()
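A hedged sketch of the producer side this worker expects; the owner class name is an assumption, but the item fields match the code above:

# Hypothetical producer; Downloader is an assumed owner class exposing the q,
# retrylimit, downloadedsegs and failedDownloads attributes used by downloadWorker().
import threading

dl = Downloader()
threading.Thread(target=dl.downloadWorker, daemon=True).start()
dl.q.put({
    "remoteurl": "http://example.com/seg0.ts",
    "downloaddir": "/tmp/",
    "localfname": "seg0.ts",
    "order": 0,
    "retries": 0,
})
dl.q.join()  # task_done() in the worker's finally block unblocks this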
Example #4
    def test_pycurl_error(self):
        curl = CurlStub(error=pycurl.error(60, "pycurl error"))
        try:
            fetch("http://example.com", curl=curl)
        except PyCurlError as error:
            self.assertEqual(error.error_code, 60)
            self.assertEqual(error.message, "pycurl error")
        else:
            self.fail("PyCurlError not raised")
Example #5
def progress(download_t, download_d, upload_t, upload_d):
    global count, start_at, timeout_max
    count = count + 1
    if count % 1000 == 0 and download_t > 0:
        r = download_d * 100.0 / download_t
        print "Total %d bytes, have %d bytes so far, %d%s" % (download_t, download_d, r, '%')
    ds = time.time() - start_at
    if timeout_max and ds >= timeout_max:
        raise pycurl.error(-1, u'download timeout. max=%s' % timeout_max)
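For context, a hedged sketch of how a progress callback like this is typically registered; the URL and file name are placeholders, and note that an exception raised inside the callback aborts the transfer and surfaces from perform() as a pycurl.error:

# Minimal wiring sketch for the progress callback above; URL/paths are placeholders.
import time
import pycurl

count = 0
start_at = time.time()
timeout_max = 30  # seconds; a falsy value disables the manual timeout

c = pycurl.Curl()
c.setopt(pycurl.URL, "http://example.com/big.bin")
c.setopt(pycurl.NOPROGRESS, 0)               # progress callbacks are off by default
c.setopt(pycurl.PROGRESSFUNCTION, progress)  # the function defined above
with open("big.bin", "wb") as fp:
    c.setopt(pycurl.WRITEDATA, fp)
    try:
        c.perform()
    except pycurl.error as err:
        print(err)  # the timeout raised in the callback surfaces here
    finally:
        c.close()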
Example #6
    def test_mocked_exception_duckduckgo(self):
        """What if curl raises an exception?"""
        # Arrange
        url = "duckduckgo.com/html"
        mock = MockResponse()
        mock.exception = pycurl.error()
        self.backend.responses.add(mock, url)
        # Act, Assert
        self.assertRaises(pycurl.error, self.browser.go, url)
Example #7
def download_file(file_key, file_vars=None, create_dirs=True):
    """Download a file from the web and save it to disk

    TODO: Remove when files.path() is removed

    Use pycurl (libcurl) to do the actual downloading. Request might be nicer for this, but turned out to be much
    slower (and in practice unusable for bigger files) and also not really supporting ftp-downloads.

    Args:
        file_key (String):   File key that should be downloaded.
        file_vars (Dict):    File variables used to find path from file_key.
        create_dirs (Bool):  Create directories as necessary before downloading file.
    """
    if (not config.where.files.download_missing.bool
            or "url" not in config.files[file_key]
            or not config.files[file_key].url.str):
        return None

    file_path = path(file_key, file_vars=file_vars, download_missing=False)
    if file_path.exists():
        return None
    if create_dirs:
        file_path.parent.mkdir(parents=True, exist_ok=True)

    file_url = url(file_key, file_vars=file_vars)
    file_path = file_path.with_name(file_url.name)
    log.info(f"Download {file_key} from '{file_url}' to '{file_path}'")
    with builtins.open(file_path, mode="wb") as fid:
        c = pycurl.Curl()
        c.setopt(c.URL, file_url)
        c.setopt(c.WRITEDATA, fid)
        try:
            c.perform()
            if not (200 <= c.getinfo(c.HTTP_CODE) <= 299):
                raise pycurl.error()
        except pycurl.error:
            log.error(
                f"Problem downloading file: {c.getinfo(c.EFFECTIVE_URL)} ({c.getinfo(c.HTTP_CODE)})"
            )
            if file_path.exists():  # Print first 10 lines to console
                head_of_file = f"Contents of '{file_path}':\n" + "\n".join(
                    file_path.read_text().split("\n")[:10])
                print(console.indent(head_of_file, num_spaces=8))
                file_path.unlink()
            log.warn(
                f"Try to download '{file_url}' manually and save it at '{file_path}'"
            )
        else:
            log.info(f"Done downloading {file_key}")
        finally:
            c.close()
    return file_path
Example #8
def test_adapter_translates_from_pycurl_errors(error_code, error_msg,
                                               expected_exception):
    request = PreparedRequest()
    request.prepare(url="http://somefakeurl", method="GET", headers={})

    pool = FakePool()
    pool.add_exception(pycurl.error(error_code, error_msg))
    pool_provider = FakePoolProvider()
    pool_provider.add_pool_for_url(request.url, pool)

    adapter = CURLAdapter(
        pool_provider_factory=lambda *args, **kwargs: pool_provider)

    with pytest.raises(expected_exception):
        adapter.send(request)
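This test reads as pytest-parametrized, but the decorator is not shown; a hedged sketch of what the parameter table could look like (the error-to-exception mapping here is an illustrative assumption, not the adapter's actual table):

# Hypothetical parametrization; the mappings are assumptions.
import pycurl
import pytest
from requests.exceptions import ConnectionError, Timeout

@pytest.mark.parametrize("error_code, error_msg, expected_exception", [
    (pycurl.E_COULDNT_CONNECT, "could not connect", ConnectionError),
    (pycurl.E_OPERATION_TIMEDOUT, "timed out", Timeout),
])
def test_adapter_translates_from_pycurl_errors(error_code, error_msg,
                                               expected_exception):
    ...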
Example #9
def download(request):
    request_url = request.get("url")
    headers = request.get("headers")
    if isinstance(headers, dict):
        headers = [k + ":" + v for k, v in headers.items()]
    proxies = request.get("proxies")
    mothed = request.get("mothed")

    c = pycurl.Curl()
    body = BytesIO()
    c.setopt(pycurl.VERBOSE, True)
    c.setopt(pycurl.HEADER, False)
    c.setopt(pycurl.TIMEOUT, 3)
    c.setopt(pycurl.CONNECTTIMEOUT, 1)
    c.setopt(pycurl.URL, request_url)
    if headers:
        print(headers)
        c.setopt(pycurl.HTTPHEADER, headers)
    c.setopt(pycurl.ENCODING, 'gzip,deflate')
    c.setopt(pycurl.SSL_VERIFYPEER, False)
    c.setopt(pycurl.SSL_VERIFYHOST, False)
    if mothed is None:
        mothed = "get"
    if mothed.lower() == "post":
        c.setopt(pycurl.POST, 1)
        data = request.get("data")
        if data:
            c.setopt(pycurl.POSTFIELDS, data)
    c.setopt(pycurl.WRITEFUNCTION, body.write)
    if proxies:
        proxy, password = convert_proxy_format(proxies)
        c.setopt(pycurl.PROXY, proxy)
        c.setopt(pycurl.PROXYUSERPWD, password)
    try:
        c.perform()
        code = c.getinfo(pycurl.RESPONSE_CODE)
        content = c.getinfo(pycurl.CONTENT_TYPE)
        if code != 200:
            raise pycurl.error(code, "")
    except pycurl.error as err:
        print(repr(err))
        raise err
    finally:
        c.close()
    return body.getvalue().decode("gbk")
Example #10
    def request(self, verb, path, body, headers):
        c = self.curl
        hdrs = [str(h + ": " + v)
                for h, v in six.iteritems(headers)] if headers else []
        verb = verb.upper()
        if verb == 'GET':
            if self.cleaning_needed:
                c.setopt(pycurl.POST, 0)
                c.unsetopt(pycurl.CUSTOMREQUEST)
                c.setopt(pycurl.NOBODY, 0)
                self.cleaning_needed = False
            if body:
                self.cleaning_needed = True
                c.setopt(pycurl.POST, 0)
                c.setopt(pycurl.CUSTOMREQUEST, verb)
                c.setopt(pycurl.NOBODY, 0)
                c.setopt(pycurl.POSTFIELDS, body or "")
        elif verb == 'POST':
            self.cleaning_needed = True
            c.unsetopt(pycurl.CUSTOMREQUEST)
            c.setopt(pycurl.NOBODY, 0)
            c.setopt(pycurl.POST, 1)
            c.setopt(pycurl.POSTFIELDS, body or "")
            hdrs.append("Expect:")
        elif verb == 'PUT' or verb == "DELETE":
            self.cleaning_needed = True
            c.setopt(pycurl.POST, 0)
            c.setopt(pycurl.CUSTOMREQUEST, verb)
            c.setopt(pycurl.NOBODY, 0)
            c.setopt(pycurl.POSTFIELDS, body or "")
        elif verb == 'HEAD':
            self.cleaning_needed = True
            c.setopt(pycurl.POST, 0)
            c.unsetopt(pycurl.CUSTOMREQUEST)
            c.setopt(pycurl.NOBODY, 1)
        else:
            raise pycurl.error("unsupported verb: " + verb)
        c.setopt(pycurl.URL, str(self.prefix + path))
        c.setopt(pycurl.HTTPHEADER, hdrs)
        self.buf = BytesIO()
        self.response_headers = []
        c.setopt(pycurl.WRITEFUNCTION, self.buf.write)
        c.setopt(pycurl.HEADERFUNCTION, self._header_handler)
        c.perform()
Example #11
def download(request):
    headers = [
        "User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
    ]
    while True:
        c = pycurl.Curl()
        body = StringIO.StringIO()
        c.setopt(pycurl.TIMEOUT, 5)
        #c.setopt(pycurl.CONNECTTIMEOUT, 1)
        c.setopt(pycurl.URL, request['comURL'])
        c.setopt(pycurl.HTTPHEADER, headers)
        c.setopt(pycurl.ENCODING, 'gzip,deflate')
        c.setopt(pycurl.SSL_VERIFYPEER, 0)
        c.setopt(pycurl.SSL_VERIFYHOST, 0)
        #c.setopt(pycurl.WRITEHEADER, headers)
        c.setopt(pycurl.WRITEFUNCTION, body.write)
        #c.setopt(pycurl.PROXY, "http://127.0.0.1:8888")                #Fiddler
        #c.setopt(pycurl.PROXYUSERPWD, self.userpwd)

        if 'formdata' in request:
            postfields = urllib.urlencode(request['formdata'])
            c.setopt(pycurl.POST, 1)
            c.setopt(pycurl.POSTFIELDS, postfields)
        if 'postfields' in request:
            c.setopt(pycurl.POST, 1)
            c.setopt(pycurl.POSTFIELDS, request['postfields'])

        try:
            c.perform()
            code = c.getinfo(pycurl.RESPONSE_CODE)
            if code != 200:
                raise pycurl.error(code, "")
            break
        except pycurl.error as err:
            if err[0] in (7, 28, 56):
                continue
            else:
                print('{}, {}, {}'.format(time.strftime('%H:%M:%S'), err[0],
                                          err[1]))
                raise err
        finally:
            c.close()

    return body.getvalue()
Example #14
    def execute_multi(self, calls, timeout):

        multi = pycurl.CurlMulti()
        for request in calls:
            multi.add_handle(request._curl)

        while True:
            while True:
                ret, num = multi.perform()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break
            if num == 0:
                break
            if multi.select(timeout) < 0:
                raise pycurl.error(pycurl.E_OPERATION_TIMEOUTED)

        failed_calls = []

        for request in calls:
            multi.remove_handle(request._curl)

            request._response_content = request._process_http_request()

            if request.response_code() == 0:
                failed_calls.append(request)
            else:
                if request._response_content:
                    request.process()

                error_string = request.error()
                if error_string:
                    self._errors.append(error_string)

        multi.close()

        return failed_calls
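For context, a hedged sketch of the bare CurlMulti loop this method is built on; the URLs are placeholders and the one-second select timeout mirrors the pattern above:

# Minimal CurlMulti loop; URLs are placeholders.
import pycurl
from io import BytesIO

multi = pycurl.CurlMulti()
handles = []
for target in ("http://example.com/", "http://example.org/"):
    easy = pycurl.Curl()
    easy.setopt(pycurl.URL, target)
    easy.setopt(pycurl.WRITEDATA, BytesIO())
    multi.add_handle(easy)
    handles.append(easy)

num = 1
while num:
    while True:
        ret, num = multi.perform()  # returns (status, number of running handles)
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break
    if num and multi.select(1.0) < 0:
        raise pycurl.error(pycurl.E_OPERATION_TIMEOUTED)

for easy in handles:
    multi.remove_handle(easy)
    easy.close()
multi.close()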
Example #16
    def _download(self, chunks, resume):
        if not resume:
            self.info.clear()
            self.info.add_chunk('{0}.chunk0'.format(
                self.path), (0, 0))  # create an initial entry

        self.chunks = []

        # initial chunk that will load complete file (if needed)
        init = CurlChunk(0, self, None, resume)

        self.chunks.append(init)
        self.manager.add_handle(init.get_handle())

        last_finish_check = 0
        last_time_check = 0
        chunks_done = set()  # list of curl handles that are finished
        chunks_created = False
        done = False
        # This is a resume, if we were chunked originally assume still can
        if self.info.get_count() > 1:
            self.chunk_support = True

        while True:
            # need to create chunks
            # will be set later by first chunk
            if not chunks_created and self.chunk_support and self.size:

                self.flags ^= Connection.Resumable  # TODO: Recheck...
                if not resume:
                    self.info.set_size(self.size)
                    self.info.create_chunks(chunks)
                    self.info.save()

                chunks = self.info.get_count()

                init.set_range(self.info.get_chunk_range(0))

                for i in range(1, chunks):
                    c = CurlChunk(
                        i, self, self.info.get_chunk_range(i), resume)

                    handle = c.get_handle()
                    if handle:
                        self.chunks.append(c)
                        self.manager.add_handle(handle)
                    else:
                        # close immediately
                        self.log.debug('Invalid curl handle -> closed')
                        c.close()

                chunks_created = True

            while True:
                ret, _ = self.manager.perform()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break

            t = time.time()

            # reduce these calls
            # when num_q is 0, the loop is exited
            while last_finish_check + 0.5 < t:
                # list of failed curl handles
                failed = []

                # TODO: Rewrite...
                # save only last exception, we can only raise one anyway
                exc = Exception()

                num_q, ok_list, err_list = self.manager.info_read()
                for c in ok_list:
                    chunk = self.find_chunk(c)
                    # check if the header implies success,
                    # else add it to failed list
                    try:
                        chunk.verify_header()
                    except ResponseException as exc:
                        self.log.debug(
                            'Chunk {0:d} failed'.format(
                                chunk.id + 1))
                        self.log.debug(exc, exc_info=True)
                        failed.append(chunk)
                    else:
                        chunks_done.add(c)

                for c in err_list:
                    curl, errno, msg = c
                    chunk = self.find_chunk(curl)
                    # test if chunk was finished
                    if errno != 23 or '0 !=' not in msg:
                        failed.append(chunk)
                        exc = pycurl.error(errno, msg)
                        self.log.debug(
                            'Chunk {0:d} failed'.format(chunk.id + 1))
                        self.log.debug(exc, exc_info=True)
                        continue
                    # check if the header implies success,
                    # else add it to failed list
                    try:
                        chunk.verify_header()
                    except ResponseException as exc:
                        self.log.debug(
                            'Chunk {0:d} failed'.format(
                                chunk.id + 1))
                        self.log.debug(exc, exc_info=True)
                        failed.append(chunk)
                    else:
                        chunks_done.add(curl)
                if not num_q:  # no more info to get

                    # check if init is not finished so we reset download
                    # connections
                    # note that other chunks are closed and everything
                    # downloaded with initial connection
                    if failed:
                        if init in failed or init.curl in chunks_done:
                            raise exc
                        self.log.error(
                            'Download chunks failed, fallback to '
                            'single connection | {0}'.format(exc))

                        # list of chunks to clean and remove
                        to_clean = [x for x in self.chunks if x is not init]
                        for chunk in to_clean:
                            self.close_chunk(chunk)
                            self.chunks.remove(chunk)
                            remove(self.info.get_chunk_name(chunk.id))

                        # let first chunk load the rest and update the
                        # info file
                        init.reset_range()
                        self.info.clear()
                        self.info.add_chunk('{0}.chunk0'.format(
                            self.path), (0, self.size))
                        self.info.save()

                    last_finish_check = t

                    if len(chunks_done) >= len(self.chunks):
                        if len(chunks_done) > len(self.chunks):
                            self.log.warning(
                                'Finished download chunks size incorrect')
                        done = True  # all chunks loaded

                    break

            if done:
                break  # all chunks loaded

            # calc speed once per second, averaging over 3 seconds
            if last_time_check + 1 < t:
                len_la = len(self.last_arrived)
                diff = [c.arrived - (self.last_arrived[i] if len_la > i else 0)
                        for i, c in enumerate(self.chunks)]

                self.last_speeds[1] = self.last_speeds[0]
                self.last_speeds[0] = self.speeds
                self.speeds = [float(a) // (t - last_time_check) for a in diff]
                self.last_arrived = [c.arrived for c in self.chunks]
                last_time_check = t

            if self._abort:
                raise Abort

            self.manager.select(1)

        for chunk in self.chunks:
            chunk.flush_file()  # make sure downloads are written to disk

        self._copy_chunks()
Example #17
class GetUrlTestCase(unittest.TestCase):
    @mock.patch('source.lib.__init__.make_pycurl_request', mock.Mock(side_effect=['ololo', error()]))
    def test_if_pycurl_error_second(self):
        with mock.patch('source.lib.__init__.logger', mock.Mock()) as m_loger:
            result = source.lib.__init__.get_url('ololo.ru', 42)
            self.assertEqual(1, m_loger.error.call_count)
            self.assertEqual('ololo.ru', result[0])
            self.assertEqual('ERROR', result[1])
            self.assertEqual(None, result[2])

    @mock.patch('source.lib.__init__.make_pycurl_request', mock.Mock(side_effect=['ololo', ValueError()]))
    def test_if_value_error_second(self):
        with mock.patch('source.lib.__init__.logger', mock.Mock()) as m_loger:
            result = source.lib.__init__.get_url('ololo.ru', 42)
            self.assertEqual(1, m_loger.error.call_count)
            self.assertEqual('ololo.ru', result[0])
            self.assertEqual('ERROR', result[1])
            self.assertEqual(None, result[2])

    @mock.patch('source.lib.__init__.make_pycurl_request', mock.Mock(side_effect=[ValueError(), 'ololo']))
    def test_if_value_error_first(self):
        with mock.patch('source.lib.__init__.logger', mock.Mock()) as m_loger:
            result = source.lib.__init__.get_url('ololo.ru', 42)
            self.assertEqual(1, m_loger.error.call_count)
            self.assertEqual('ololo.ru', result[0])
            self.assertEqual('ERROR', result[1])
            self.assertEqual(None, result[2])

    @mock.patch('source.lib.__init__.make_pycurl_request', mock.Mock(side_effect=[error(), 'ololo']))
    def test_if_pycurl_error_first(self):
        with mock.patch('source.lib.__init__.logger', mock.Mock()) as m_loger:
            result = source.lib.__init__.get_url('ololo.ru', 42)
            self.assertEqual(1, m_loger.error.call_count)
            self.assertEqual('ololo.ru', result[0])
            self.assertEqual('ERROR', result[1])
            self.assertEqual(None, result[2])

    @mock.patch('source.lib.__init__.make_pycurl_request', mock.Mock(return_value=['ish', 'ololo.ru']))
    def test_if_new_redirect_url_and_match(self):
        with (mock.patch('source.lib.__init__.OK_REDIRECT', mock.Mock())):
            with (mock.patch('source.lib.__init__.OK_REDIRECT.match', mock.Mock(return_value=True))):
                result = source.lib.__init__.get_url('vk.ru', 42)
                self.assertEqual(None, result[0])
                self.assertEqual(None, result[1])
                self.assertEqual('ish', result[2])

    @mock.patch('source.lib.__init__.make_pycurl_request', mock.Mock(return_value=['ish', 'ololo.ru']))
    @mock.patch('source.lib.__init__.check_for_meta', mock.Mock(return_value=None))
    @mock.patch('source.lib.__init__.prepare_url', mock.Mock(return_value=None))
    def test_redirect_url_and_not(self):
        with (mock.patch('source.lib.__init__.OK_REDIRECT', mock.Mock())):
            with (mock.patch('source.lib.__init__.OK_REDIRECT.match', mock.Mock(return_value=False))):
                result = source.lib.__init__.get_url('vk.ru', 42)
                self.assertEqual(None, result[0])
                self.assertEqual(source.lib.__init__.REDIRECT_HTTP, result[1])
                self.assertEqual('ish', result[2])

    @mock.patch('source.lib.__init__.make_pycurl_request', mock.Mock(return_value=['ish', None]))
    @mock.patch('source.lib.__init__.check_for_meta', mock.Mock(return_value='ololo.ru'))
    @mock.patch('source.lib.__init__.prepare_url', mock.Mock(return_value='vk.com'))
    def test_not_redirect_url_and_redirect_url_and_not_urlsplit(self):
        urlsplit = mock.MagicMock()
        urlsplit.scheme = mock.Mock(return_value='bugaga')
        with (mock.patch('source.lib.__init__.OK_REDIRECT', mock.Mock())):
            with (mock.patch('source.lib.__init__.OK_REDIRECT.match', mock.Mock(return_value=False))):
                result = source.lib.__init__.get_url('vk.ru', 42)
                self.assertEqual(source.lib.__init__.REDIRECT_META, result[1])
                self.assertEqual('ish', result[2])

    @mock.patch('source.lib.__init__.make_pycurl_request', mock.Mock(return_value=['ish', None]))
    @mock.patch('source.lib.__init__.check_for_meta', mock.Mock(return_value=None))
    @mock.patch('source.lib.__init__.prepare_url', mock.Mock(return_value='vk.com'))
    def test_not_redirect_url_and_not_redirect_url_and_not_urlsplit(self):
        urlsplit = mock.Mock()
        urlsplit.scheme = 'market'
        with (mock.patch('source.lib.__init__.OK_REDIRECT', mock.Mock())):
            with (mock.patch('source.lib.__init__.OK_REDIRECT.match', mock.Mock(return_value=False))):
                with (mock.patch('source.lib.__init__.urlsplit', mock.Mock(return_value=urlsplit))):
                    result = source.lib.__init__.get_url('vk.ru', 42)
                    self.assertEqual(None, result[1])
                    self.assertEqual('ish', result[2])

    @mock.patch('source.lib.__init__.make_pycurl_request', mock.Mock(return_value=['ish', 'ololo']))
    @mock.patch('source.lib.__init__.prepare_url', mock.Mock(return_value='vk.com'))
    def test_redirect_url_and_urlsplit(self):
        urlsplit = mock.Mock()
        urlsplit.scheme = 'market'
        with (mock.patch('source.lib.__init__.OK_REDIRECT', mock.Mock())):
            with (mock.patch('source.lib.__init__.OK_REDIRECT.match', mock.Mock(return_value=False))):
                with (mock.patch('source.lib.__init__.urlsplit', mock.Mock(return_value=urlsplit))):
                    with (mock.patch('source.lib.__init__.fix_market_url', mock.Mock())) as m_fix:
                        result = source.lib.__init__.get_url('vk.ru', 42)
                        self.assertEqual('ish', result[2])
                        self.assertEqual(source.lib.__init__.REDIRECT_HTTP, result[1])
                        self.assertEqual(1, m_fix.call_count)
Example #18
    def setopt(self, method='GET', ua='', cookies=None, proxy=None, url=None, verbose=False, headers=None, timeout=120, data=None, allow_redirect=False):
        '''
        @proxy protocol://host:port eg: socks5://127.0.0.1:1080
        '''
        self._curl.reset()
        method = method.upper() 

        if method not in ("GET", "POST", "DELETE", "PUT", "OPTIONS", "HEAD"):
            raise pycurl.error("not support method:%s" % method)

        if method in ("HEAD", "DELETE"):
            self._curl.setopt(pycurl.NOBODY, False)

        if method in ("POST", "PUT"):
            self._curl.setopt(pycurl.POST, True)

        if method in ("PUT", "DELETE", "PUT", "OPTIONS"):
            self._curl.setopt(pycurl.CUSTOMREQUEST, method)

        self._curl.setopt(pycurl.NOSIGNAL, True)
        self._curl.setopt(pycurl.URL, url)

        self.setproxy(proxy)
        
        if verbose:
            self._curl.setopt(pycurl.VERBOSE, True)

        allheaders = []

        self.setcookies(cookies)

        if headers:
            curl_headers = self._header2curlstyle(headers)
            allheaders.extend(curl_headers)

        if ua:
            self.ua = ua

        allheaders.extend(["User-Agent: %s" % self.ua])

        if allheaders:
            self._curl.setopt(pycurl.HTTPHEADER, allheaders)

        if method in ("POST", "PUT"):
            if isinstance(data, str):
                self._curl.setopt(pycurl.POSTFIELDS, data)
            elif hasattr(data, "read"):
                self._curl.setopt(pycurl.UPLOAD, True)
                self._curl.setopt(pycurl.READFUNCTION, data.read)
                data.seek(0, 2)
                filesize = data.tell()
                data.seek(0)
                self._curl.setopt(pycurl.INFILESIZE, filesize)
            elif isinstance(data, dict):
                postfields = self._dict2urlfields(data)
                self._curl.setopt(pycurl.POSTFIELDS, postfields)

        self._curl.setopt(pycurl.TIMEOUT, timeout)
        # if self.cookiefile:
            # self._curl.setopt(pycurl.COOKIEJAR, self.cookiefile)
            # self._curl.setopt(pycurl.COOKIEFILE, self.cookiefile)
        self._curl.setopt(pycurl.HEADERFUNCTION, self.headerfunc)
        self._curl.setopt(pycurl.WRITEFUNCTION, self.contentfunc)

        if allow_redirect:
            self._curl.setopt(pycurl.FOLLOWLOCATION, 1)
            self._curl.setopt(pycurl.MAXREDIRS, 5) 
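A hedged sketch of driving this wrapper; the owner class and the final perform() call are assumptions, only the keyword arguments come from the signature above:

# Hypothetical driver; HttpClient is an assumed owner of _curl and setopt().
client = HttpClient()
client.setopt(method='POST',
              url='http://example.com/api',
              headers={'Accept': 'application/json'},
              data={'a': '1', 'b': '2'},  # dicts go through _dict2urlfields
              timeout=30,
              allow_redirect=True)
client._curl.perform()  # assumed: setopt() only configures the handle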
Example #19
def __request(url, request_type, cookies='', post_data={}, proxy=None,
              headers=[], useragent='', referer='', redirect_count=0,
              attempt=1, headers_only=False):
    """
    Universal function, used by the get & post helpers
    Returns:
        Headers dictionary: dict
        Body: string
        Cookies: string; query string: a=1&b=2
        Connect time: float
        Current URL: string
        Redirect URL: string | none
        Redirect count: integer
    """

    # merge the headers passed to the function with the global ones
    all_headers = HEADERS + headers
    all_headers = __headers_to_dict(all_headers, replace_duplicates=True)
    all_headers = ["%s: %s" % (k, v) for k, v in all_headers.items()]

    c = pycurl.Curl()

    got_headers = StringIO()
    body = StringIO()

    if headers_only:
        c.setopt(pycurl.NOBODY, 1)
    else:
        c.setopt(pycurl.WRITEFUNCTION, body.write)

    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.TIMEOUT, TIMEOUT)
    c.setopt(pycurl.HEADERFUNCTION, got_headers.write)

    """
    If it is 1, libcurl will not use any functions that install signal
    handlers or any functions that cause signals to be sent to the
    process. This option is mainly here to allow multi-threaded unix
    applications to still set/use all timeout options etc, without risking
    getting signals
    """
    c.setopt(pycurl.NOSIGNAL, 1)

    # the user agent passed to the function takes priority;
    # the globally configured one is the fallback
    if not useragent:
        if USERAGENT:
            useragent = USERAGENT

    if useragent:
        c.setopt(pycurl.USERAGENT, useragent)

    if all_headers:
        c.setopt(pycurl.HTTPHEADER, all_headers)

    # Set the referer
    if referer:
        c.setopt(pycurl.REFERER, referer)


    # Raise an error if the response code is >= 400
    c.setopt(pycurl.FAILONERROR, 1)

    # IMPORTANT: since cookies are passed as a string rather than stored in
    # files, FOLLOWLOCATION would not send cookies that were set right before
    # a redirect, so we avoid it
    #
    # c.setopt(pycurl.FOLLOWLOCATION, 1)
    c.setopt(pycurl.COOKIE, cookies)
    c.setopt(pycurl.VERBOSE, 1)
    c.setopt(pycurl.DEBUGFUNCTION, __logging)

    # SSL certificates are not verified; requests become vulnerable to MITM attacks
    c.setopt(pycurl.SSL_VERIFYHOST, 0)
    c.setopt(pycurl.SSL_VERIFYPEER, 0)

    if request_type.lower() == 'post' and post_data:
        c.setopt(pycurl.HTTPPOST, post_data.items())

    # If a proxy was supplied, route the request through it
    if proxy:
        # CURL proxytype
        if PROXY_TYPE == 'socks5':
            proxy_type = pycurl.PROXYTYPE_SOCKS5
        elif PROXY_TYPE == 'socks4':
            proxy_type = pycurl.PROXYTYPE_SOCKS4
        elif PROXY_TYPE == 'http':
            proxy_type = pycurl.PROXYTYPE_HTTP

        # if the ip and port cannot be split apart, raise an error
        try:
            proxy_ip, port = proxy.split(':')
            port = int(port)
        except ValueError:
            logging.error("Possibly malformed proxy: %s", str(proxy))
            # proxy_ip/port may be unbound if the split above failed,
            # so pass the raw proxy string and a placeholder port
            raise DeadProxy(proxy, 0)

        c.setopt(pycurl.PROXY, proxy_ip)
        c.setopt(pycurl.PROXYPORT, port)
        c.setopt(pycurl.PROXYTYPE, proxy_type)

    # Handle exceptions raised while fetching the page
    try:
        c.perform()
    except pycurl.error as err:
        """
        CURLE_HTTP_RETURNED_ERROR (22)
        This is returned if CURLOPT_FAILONERROR is set TRUE and the HTTP
        server returns an error code that is >= 400.
        """
        if err[0] == 22:
            raise WrongCode(c.getinfo(pycurl.RESPONSE_CODE))

        """
        Если используем прокси, то все ошибки, кроме неверного кода ответа
        спихиваем на него
        """
        if proxy:
            raise DeadProxy(proxy_ip, port)
        else:
            raise pycurl.error(str(err))

    # headers as a dict
    got_headers = __get_headers(got_headers.getvalue())

    result = {'headers': got_headers,
              'body': body.getvalue(),
              'current_proxy': proxy,
              'useragent': useragent,
              'referer': referer,
              'sent_headers': all_headers,
              'cookies': __get_cookies(got_headers['Set-Cookie'], cookies),
              'connect_time': c.getinfo(pycurl.CONNECT_TIME),
              'response_code': c.getinfo(pycurl.RESPONSE_CODE),
              'current_url': c.getinfo(pycurl.EFFECTIVE_URL),
              'redirect_url': c.getinfo(pycurl.REDIRECT_URL),
              'redirect_count': redirect_count,
              'headers_only': headers_only}
    c.close()
    del c

    return result
Example #20
    def _download(self, chunks, resume):
        if not resume:
            self.info.clear()
            self.info.add_chunk("{0}.chunk0".format(self.path),
                                (0, 0))  #: create an initial entry

        self.chunks = []

        # initial chunk that will load complete file (if needed)
        init = CurlChunk(0, self, None, resume)

        self.chunks.append(init)
        self.manager.add_handle(init.get_handle())

        last_finish_check = 0
        last_time_check = 0
        chunks_done = set()  #: list of curl handles that are finished
        chunks_created = False
        done = False
        if self.info.get_count(
        ) > 1:  #: This is a resume, if we were chunked originally assume still can
            self.chunk_support = True

        while True:
            # need to create chunks
            if not chunks_created and self.chunk_support and self.size:  #: will be set later by first chunk

                self.flags ^= Connection.Resumable
                if not resume:
                    self.info.set_size(self.size)
                    self.info.create_chunks(chunks)
                    self.info.save()

                chunks = self.info.get_count()

                init.set_range(self.info.get_chunk_range(0))

                for i in range(1, chunks):
                    c = CurlChunk(i, self, self.info.get_chunk_range(i),
                                  resume)

                    handle = c.get_handle()
                    if handle:
                        self.chunks.append(c)
                        self.manager.add_handle(handle)
                    else:
                        # close immediately
                        self.pyload.log.debug("Invalid curl handle -> closed")
                        c.close()

                chunks_created = True

            while True:
                ret, num_handles = self.manager.perform()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break

            t = time()

            # reduce these calls
            # when num_q is 0, the loop is exited
            while last_finish_check + 0.5 < t:
                # list of failed curl handles
                failed = []
                ex = None  #: save only last exception, we can only raise one anyway

                num_q, ok_list, err_list = self.manager.info_read()
                for c in ok_list:
                    chunk = self.find_chunk(c)
                    try:  #: check if the header implies success, else add it to failed list
                        chunk.verify_header()
                    except ResponseException as e:
                        self.pyload.log.debug("Chunk {0:d} failed: {1}".format(
                            chunk.id + 1, str(e)))
                        failed.append(chunk)
                        ex = e
                    else:
                        chunks_done.add(c)

                for c in err_list:
                    curl, errno, msg = c
                    chunk = self.find_chunk(curl)
                    # test if chunk was finished
                    if errno != 23 or "0 !=" not in msg:
                        failed.append(chunk)
                        ex = pycurl.error(errno, msg)
                        self.pyload.log.debug("Chunk {0:d} failed: {1}".format(
                            chunk.id + 1, ex))
                        continue

                    try:  #: check if the header implies success, else add it to failed list
                        chunk.verify_header()
                    except ResponseException as e:
                        self.pyload.log.debug("Chunk {0:d} failed: {1}".format(
                            chunk.id + 1, str(e)))
                        failed.append(chunk)
                        ex = e
                    else:
                        chunks_done.add(curl)
                if not num_q:  #: no more info to get

                    # check if init is not finished so we reset download connections
                    # note that other chunks are closed and everything
                    # downloaded with initial connection
                    if failed and init not in failed and init.c not in chunks_done:
                        self.pyload.log.error(
                            _("Download chunks failed, fallback to single connection | {0}"
                              .format(ex)))

                        # list of chunks to clean and remove
                        to_clean = [x for x in self.chunks if x is not init]
                        for chunk in to_clean:
                            self.close_chunk(chunk)
                            self.chunks.remove(chunk)
                            remove(
                                format.path(self.info.get_chunk_name(
                                    chunk.id)))

                        # let first chunk load the rest and update the info
                        # file
                        init.reset_range()
                        self.info.clear()
                        self.info.add_chunk("{0}.chunk0".format(self.path),
                                            (0, self.size))
                        self.info.save()
                    elif failed:
                        raise ex

                    last_finish_check = t

                    if len(chunks_done) >= len(self.chunks):
                        if len(chunks_done) > len(self.chunks):
                            self.pyload.log.warning(
                                _("Finished download chunks size incorrect, please report bug"
                                  ))
                        done = True  #: all chunks loaded

                    break

            if done:
                break  #: all chunks loaded

            # calc speed once per second, averaging over 3 seconds
            if last_time_check + 1 < t:
                diff = [
                    c.arrived -
                    (self.last_arrived[i] if len(self.last_arrived) > i else 0)
                    for i, c in enumerate(self.chunks)
                ]

                self.last_speeds[1] = self.last_speeds[0]
                self.last_speeds[0] = self.speeds
                self.speeds = [float(a) // (t - last_time_check) for a in diff]
                self.last_arrived = [c.arrived for c in self.chunks]
                last_time_check = t

            if self.do_abort:
                raise Abort

            self.manager.select(1)

        for chunk in self.chunks:
            chunk.flush_file()  #: make sure downloads are written to disk

        self._copy_chunks()
Example #21
    def _download(self, chunks, resume):
        if not resume:
            self.info.clear()
            self.info.add_chunk(f"{self.filename}.chunk0",
                                (0, 0))  #: create an initial entry)

        self.chunks = []

        # initial chunk that will load complete file (if needed)
        init = HTTPChunk(0, self, None, resume)

        self.chunks.append(init)
        self.m.add_handle(init.get_handle())

        last_finish_check = 0
        last_time_check = 0
        chunks_done = set()  #: list of curl handles that are finished
        chunks_created = False
        done = False
        if (
                self.info.get_count() > 1
        ):  #: This is a resume, if we were chunked originally assume still can
            self.chunk_support = True

        while True:
            # need to create chunks
            if (not chunks_created and self.chunk_support
                    and self.size):  #: will be set later by first chunk

                if not resume:
                    self.info.set_size(self.size)
                    self.info.create_chunks(chunks)
                    self.info.save()

                chunks = self.info.get_count()

                init.set_range(self.info.get_chunk_range(0))

                for i in range(1, chunks):
                    c = HTTPChunk(i, self, self.info.get_chunk_range(i),
                                  resume)

                    handle = c.get_handle()
                    if handle:
                        self.chunks.append(c)
                        self.m.add_handle(handle)
                    else:
                        # close immediately
                        self.log.debug("Invalid curl handle -> closed")
                        c.close()

                chunks_created = True

            while True:
                ret, num_handles = self.m.perform()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break

            t = time.time()

            # reduce these calls
            while last_finish_check + 0.5 < t:
                # list of failed curl handles
                failed = []
                ex = None  #: save only last exception, we can only raise one anyway

                num_q, ok_list, err_list = self.m.info_read()
                for c in ok_list:
                    chunk = self.find_chunk(c)
                    try:  #: check if the header implies success, else add it to failed list
                        chunk.verify_header()
                    except BadHeader as exc:
                        self.log.debug(f"Chunk {chunk.id + 1} failed: {exc}")
                        failed.append(chunk)
                        ex = exc
                    else:
                        self.log.debug(
                            f"Chunk {chunk.id + 1} download finished")
                        chunks_done.add(c)

                for c in err_list:
                    curl, errno, msg = c
                    chunk = self.find_chunk(curl)
                    # test if chunk was finished
                    if errno != pycurl.E_WRITE_ERROR or not chunk.aborted:
                        failed.append(chunk)
                        ex = pycurl.error(errno, msg)
                        self.log.debug(f"Chunk {chunk.id + 1} failed: {ex}")
                        continue

                    try:  #: check if the header implies success, else add it to failed list
                        chunk.verify_header()
                    except BadHeader as exc:
                        self.log.debug(f"Chunk {chunk.id + 1} failed: {exc}")
                        failed.append(chunk)
                        ex = exc
                    else:
                        self.log.debug(
                            f"Chunk {chunk.id + 1} download finished")
                        chunks_done.add(curl)
                if not num_q:  #: no more infos to get

                    # check if init is not finished so we reset download connections
                    # note that other chunks are closed and downloaded with init too
                    if failed and init not in failed and init.c not in chunks_done:
                        self.log.error(
                            f"Download chunks failed, fallback to single connection | {ex}"
                        )

                        # list of chunks to clean and os.remove
                        to_clean = [x for x in self.chunks if x is not init]
                        for chunk in to_clean:
                            self.close_chunk(chunk)
                            self.chunks.remove(chunk)
                            os.remove(self.info.get_chunk_name(chunk.id))

                        # let first chunk load the rest and update the info file
                        init.reset_range()
                        self.info.clear()
                        self.info.add_chunk(f"{self.filename}.chunk0",
                                            (0, self.size))
                        self.info.save()
                    elif failed:
                        raise ex or Exception

                    last_finish_check = t

                    if len(chunks_done) >= len(self.chunks):
                        if len(chunks_done) > len(self.chunks):
                            self.log.warning(
                                "Finished download chunks size incorrect, please report bug."
                            )
                        done = True  #: all chunks loaded

                    break

            if done:
                break  #: all chunks loaded

            # calc speed once per second, averaging over 3 seconds
            if last_time_check + 1 < t:
                diff = [
                    c.arrived -
                    (self.last_arrived[i] if len(self.last_arrived) > i else 0)
                    for i, c in enumerate(self.chunks)
                ]

                self.last_speeds[1] = self.last_speeds[0]
                self.last_speeds[0] = self.speeds
                self.speeds = [float(a) / (t - last_time_check) for a in diff]
                self.last_arrived = [c.arrived for c in self.chunks]
                last_time_check = t
                self.update_progress()

            if self.abort:
                raise Abort

            # time.sleep(0.003) #supress busy waiting - limits dl speed to  (1 / x) *
            # buffersize
            self.m.select(1)

        for chunk in self.chunks:
            chunk.flush_file()  #: make sure downloads are written to disk

        self._copy_chunks()
Example #22
    def download(self, url):
        c = pycurl.Curl()  # instantiate a curl object
        c.setopt(pycurl.URL, url)
        c.setopt(pycurl.USERAGENT, self.userAgent)  # masquerade as the given agent
        c.setopt(pycurl.CONNECTTIMEOUT, self.connectionTimeout)  # connection timeout
        c.setopt(pycurl.TIMEOUT, self.operationTimeout)  # operation timeout
        c.setopt(pycurl.ENCODING, self.encoding)  # encoding
        c.setopt(pycurl.HTTPHEADER, self.headerParams)  # Accept headers
        c.setopt(pycurl.DNS_CACHE_TIMEOUT, 60)  # keep DNS info for 60 seconds (the default)
        if self.referer:
            c.setopt(pycurl.REFERER, self.referer)  # tell the server which link we came from
        if self.httpAuth:  # auth
            c.setopt(pycurl.HTTPAUTH, self.httpAuthType)  # needed for the Authorization request header
            c.setopt(pycurl.USERPWD, self.httpAuth)
        if self.agency.has_key('ADD') and self.agency.has_key('PWD'):  # proxy
            c.setopt(pycurl.PROXY, self.agency['ADD'])
            c.setopt(pycurl.PROXYUSERPWD, self.agency['PWD'])  # proxy credentials
        if self.COOKIE:  # cookie
            c.setopt(pycurl.COOKIEFILE, self.COOKIE)
            c.setopt(pycurl.COOKIEJAR, self.COOKIE)
        if self.POST:  # post
            if self.DATA:
                Data = self.DATA
            else:
                print 'DATA is necessary when request type is POST'
                return False
            c.setopt(pycurl.POST, 1)
            c.setopt(pycurl.POSTFIELDS, Data)
            self.POST = 0

        # response headers are appended here through a callback
        # (StringIO serves as an in-memory file object)
        self.headerWrite = StringIO.StringIO()
        if self.isDownload:  # download straight into the target file
            self.b = open(self.isDownload, 'wb')
        else:
            self.b = StringIO.StringIO()  # buffer the content in memory

        # the following three options rely on callback functions
        c.setopt(pycurl.WRITEFUNCTION, self.contentWriteCallBack)  # write the page body
        c.setopt(pycurl.HEADERFUNCTION, self.headerWriteCallBack)  # write the response headers
        if self.isDownload:  # progress callback for downloads/uploads
            c.setopt(pycurl.NOPROGRESS, self.noProgress)
            c.setopt(pycurl.PROGRESSFUNCTION, self.progressWriteCallBack)  # report progress

        c.setopt(pycurl.FOLLOWLOCATION, 1)  # follow redirects (HTTP codes starting with 3)
        c.setopt(pycurl.MAXREDIRS, 10)  # at most 10 redirects, to avoid crawler traps
        c.setopt(pycurl.SSL_VERIFYPEER, 0)  # SSL certificate
        c.setopt(pycurl.SSL_VERIFYHOST, 0)  # SSL certificate

        c.setopt(pycurl.NOSIGNAL, 1)  # This option is here to allow multi-threaded unix applications to still set/use all timeout options etc, without risking getting signals.
        #c.setopt(pycurl.FRESH_CONNECT, 1)  # force a fresh connection instead of a cached one
        #c.setopt(pycurl.FORBID_REUSE, 1)  # force the connection closed after the transfer, no reuse
        #c.setopt(pycurl.SOCKET_TIMEOUT, 9)
        #c.setopt(pycurl.E_OPERATION_TIMEOUTED, 3600)

        # start the request
        try:
            c.perform()  # run the transfer

            # detailed timing info for the request, in milliseconds
            dnsTime = c.getinfo(c.NAMELOOKUP_TIME)*1000  # DNS resolution time
            connectTimeTemp = c.getinfo(c.CONNECT_TIME)*1000  # time to connect to the remote server
            preTransferTimeTemp = c.getinfo(c.PRETRANSFER_TIME)*1000  # time from connect until ready to transfer
            startTransferTimeTemp = c.getinfo(c.STARTTRANSFER_TIME)*1000  # time from ready-to-transfer until the first byte
            totalTime = c.getinfo(c.TOTAL_TIME)*1000  # total time of the last request
            connectTime = connectTimeTemp - dnsTime
            transferTime = totalTime - preTransferTimeTemp

            #print 'dnsTime:',dnsTime
            #print 'connectTime:',connectTime
            #print 'transferTime:',transferTime
            #print 'totalTime:',totalTime

            if self.isDownload:
                c.close()
                self.b.close()  # close the file
                self.isDownload = False
                return True

            self.headerContent = self.headerWrite.getvalue()  # read the response headers once the request finishes
            self.httpCode = c.getinfo(c.HTTP_CODE)  # HTTP status code
            # c.getinfo(c.CONTENT_TYPE) returns e.g. text/html; charset=utf-8
            contenttype = re.compile('charset=(.*)', re.I|re.S|re.M).findall(c.getinfo(c.CONTENT_TYPE))
            if contenttype:
                webCharset = contenttype[0]
            else:
                webCharset = self.charset

            value = self.b.getvalue()
            if webCharset.lower() != 'utf-8':
                #print 'encoding to utf-8...'
                commonutil = CommonUtil()
                value = commonutil.convertCoding(webCharset, self.charset, value)
            c.close()
            self.b.close()

            if self.httpCode >= 400:
                errorMessages = re.compile('<title>(.*)</title>', re.I|re.S|re.M).findall(value)
                if errorMessages:
                    errorMessage = errorMessages[0]
                else:
                    errorMessage = 'Http code >=400'
                raise pycurl.error(errorMessage.decode('UTF-8'))  # decode to cope with non-ASCII (e.g. Chinese) titles
        except pycurl.error, e:
            print sys.exc_info()[0], sys.exc_info()[1]  # print the error info
            return False
Example #23
def test_translate_curl_exception(error_code, error_msg, expected_exception):
    curl_exception = pycurl.error(error_code, error_msg)
    translated_exception = translate_curl_exception(curl_exception)
    assert translated_exception == expected_exception
Example #24
    def test_handle_response_pycurl_error(self):
        """PycURLGetter allows other errors to propagate."""
        error = pycurl.error(pycurl.E_MULTI_OUT_OF_MEMORY)
        getter = PycURLGetter(FakeCurl(perform_error=error))
        with self.assertRaises(pycurl.error):
            getter.handle_response()
Example #25
    def test_cert_verification_failed(self):
        """Cert verification error raises CertificateVerificationFailed."""
        error = pycurl.error(pycurl.E_SSL_CACERT)
        getter = PycURLGetter(FakeCurl(perform_error=error))
        with self.assertRaises(CertificateVerificationFailed):
            getter.handle_response()
Example #26
    def test_handle_response_connection_error(self):
        """On connection error, handle_response raises CouldNotConnect."""
        error = pycurl.error(pycurl.E_COULDNT_CONNECT)
        getter = PycURLGetter(FakeCurl(perform_error=error))
        with self.assertRaises(CouldNotConnect):
            getter.handle_response()
Example #27
    def download(self, request):
        request_url = request.get("url")
        headers = request.get("headers")
        if isinstance(headers, dict):
            headers = [k + ":" + v for k, v in headers.items()]
        proxies = request.get("proxy")
        method = request.get("method")
        encoding = request.get("encoding")

        c = pycurl.Curl()
        body = BytesIO()

        # default options
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        c.setopt(pycurl.MAXREDIRS, 5)
        c.setopt(pycurl.TIMEOUT, 3)
        c.setopt(pycurl.CONNECTTIMEOUT, 1)
        c.setopt(pycurl.URL, request_url)
        if headers:
            c.setopt(pycurl.HTTPHEADER, headers)
        c.setopt(pycurl.ENCODING, 'gzip,deflate')
        c.setopt(pycurl.SSL_VERIFYPEER, False)
        c.setopt(pycurl.SSL_VERIFYHOST, False)
        if method is None:
            method = "get"
        if method.lower() == "post":
            c.setopt(pycurl.POST, 1)
            data = request.get("data")
            if data:
                c.setopt(pycurl.POSTFIELDS, urllib.urlencode(data))
        c.setopt(pycurl.WRITEFUNCTION, body.write)
        if self.use_proxy:
            if proxies:
                proxy, password = self.convert_proxy_format(proxies)
                self.log.debug((proxy, password))
                c.setopt(pycurl.PROXY, proxy)
                c.setopt(pycurl.PROXYUSERPWD, password)
            elif self.used_proxy:
                proxy, password = self.convert_proxy_format(self.used_proxy)
                self.log.debug((proxy, password))
                c.setopt(pycurl.PROXY, proxy)
                c.setopt(pycurl.PROXYUSERPWD, password)

        # apply pycurl_config overrides, if any
        if self.pycurl_config:
            for k, v in self.pycurl_config.items():
                c.setopt(k, v)
        # hook for subclasses to set their own options
        self.overwrite_download_opt(c)

        try:
            c.perform()
            code = c.getinfo(pycurl.HTTP_CODE)
            if code != 200:
                raise pycurl.error(code, "")
        except pycurl.error as err:
            #if err[0] not in (7,28,56):
            #   self.log.error(err)
            self.log.exception(err)
            #raise err
            return ""
        finally:
            c.close()
        result = body.getvalue()
        coding = encoding or chardet.detect(result)['encoding']
        return result.decode(coding)
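
A hypothetical call to the method above; downloader stands in for an instance of the class, and every key except "url" is optional because the method reads them all via request.get():

request = {
    "url": "http://example.com/page",
    "headers": {"User-Agent": "Mozilla/5.0"},
    "method": "get",       # or "post", together with a "data" dict
    "encoding": "utf-8",   # skips chardet detection when known
}
html = downloader.download(request)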
Exemple #28
0
def __request(url,
              request_type,
              cookies='',
              post_data={},
              proxy=None,
              headers=[],
              useragent='',
              referer='',
              redirect_count=0,
              attempt=1,
              headers_only=False):
    """
    Универсальная функция. Используется в функциях get & post
    Возвращает:
        Словарь заголовков: dict
        Тело: string
        Cookies: string; query string: a=1&b=2
        Connect time: float
        Current URL: string
        Redirect URL: string | none
        Redirect count: integer
    """

    # сливаем переданые в функции и глобальные заголовки
    all_headers = HEADERS + headers
    all_headers = __headers_to_dict(all_headers, replace_duplicates=True)
    all_headers = ["%s: %s" % (k, v) for k, v in all_headers.items()]

    c = pycurl.Curl()

    got_headers = StringIO()
    body = StringIO()

    if headers_only:
        c.setopt(pycurl.NOBODY, 1)
    else:
        c.setopt(pycurl.WRITEFUNCTION, body.write)

    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.TIMEOUT, TIMEOUT)
    c.setopt(pycurl.HEADERFUNCTION, got_headers.write)
    """
    If it is 1, libcurl will not use any functions that install signal
    handlers or any functions that cause signals to be sent to the
    process. This option is mainly here to allow multi-threaded unix
    applications to still set/use all timeout options etc, without risking
    getting signals
    """
    c.setopt(pycurl.NOSIGNAL, 1)

    # a user agent passed to the function takes priority;
    # the globally configured one is the fallback
    if not useragent:
        if USERAGENT:
            useragent = USERAGENT

    if useragent:
        c.setopt(pycurl.USERAGENT, useragent)

    if all_headers:
        c.setopt(pycurl.HTTPHEADER, all_headers)

    # Set the referer
    if referer:
        c.setopt(pycurl.REFERER, referer)

    # Raise an error if the response code is >= 400
    c.setopt(pycurl.FAILONERROR, 1)

    # IMPORTANT: since cookies are passed as a string instead of being
    # stored in files, FOLLOWLOCATION would not send cookies that are
    # assigned right before a redirect. So we do not use it.
    #
    # c.setopt(pycurl.FOLLOWLOCATION, 1)
    c.setopt(pycurl.COOKIE, cookies)
    c.setopt(pycurl.VERBOSE, 1)
    c.setopt(pycurl.DEBUGFUNCTION, __logging)

    # do not verify the SSL certificate; this leaves requests vulnerable to MITM attacks
    c.setopt(pycurl.SSL_VERIFYHOST, 0)
    c.setopt(pycurl.SSL_VERIFYPEER, 0)

    if request_type.lower() == 'post' and post_data:
        c.setopt(pycurl.HTTPPOST, post_data.items())

    # If a proxy is given, route the request through it
    if proxy:
        # CURL proxy type
        if PROXY_TYPE == 'socks5':
            proxy_type = pycurl.PROXYTYPE_SOCKS5
        elif PROXY_TYPE == 'socks4':
            proxy_type = pycurl.PROXYTYPE_SOCKS4
        else:
            # default to plain http proxying for any other value
            proxy_type = pycurl.PROXYTYPE_HTTP

        # if we cannot split the ip and port apart, report the proxy as dead
        try:
            proxy_ip, port = proxy.split(':')
            port = int(port)
        except ValueError:
            logging.error("Possibly malformed proxy: %s", str(proxy))
            raise DeadProxy(proxy, None)

        c.setopt(pycurl.PROXY, proxy_ip)
        c.setopt(pycurl.PROXYPORT, port)
        c.setopt(pycurl.PROXYTYPE, proxy_type)

    # Handle exceptions raised while loading the page
    try:
        c.perform()
    except pycurl.error as err:
        """
        CURLE_HTTP_RETURNED_ERROR (22)
        This is returned if CURLOPT_FAILONERROR is set TRUE and the HTTP
        server returns an error code that is >= 400.
        """
        if err.args[0] == 22:
            raise WrongCode(c.getinfo(pycurl.RESPONSE_CODE))
        """
        When a proxy is in use, blame it for every error other than
        a wrong response code
        """
        if proxy:
            raise DeadProxy(proxy_ip, port)
        else:
            raise pycurl.error(str(err))

    # headers as a dict
    got_headers = __get_headers(got_headers.getvalue())

    result = {
        'headers': got_headers,
        'body': body.getvalue(),
        'current_proxy': proxy,
        'useragent': useragent,
        'referer': referer,
        'sent_headers': all_headers,
        'cookies': __get_cookies(got_headers['Set-Cookie'], cookies),
        'connect_time': c.getinfo(pycurl.CONNECT_TIME),
        'response_code': c.getinfo(pycurl.RESPONSE_CODE),
        'current_url': c.getinfo(pycurl.EFFECTIVE_URL),
        'redirect_url': c.getinfo(pycurl.REDIRECT_URL),
        'redirect_count': redirect_count,
        'headers_only': headers_only
    }
    c.close()
    del c

    return result
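
The __get_headers and __get_cookies helpers referenced above are not part of this example. A minimal sketch of what the header parser might look like, assuming the raw response-header blob captured via HEADERFUNCTION:

# Hypothetical sketch; the example's real __get_headers is not shown.
def __get_headers(raw_headers):
    headers = {'Set-Cookie': ''}  # the result dict expects this key to exist
    for line in raw_headers.splitlines():
        if ':' in line:
            key, _, value = line.partition(':')
            if key.lower() == 'set-cookie':
                headers['Set-Cookie'] += value.strip() + '; '
            else:
                headers[key] = value.strip()
    return headers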
Exemple #29
0
 def perform(_):
     raise pycurl.error('Test Exception')
Exemple #30
0
    def download_file(
        self,
        file_key: str,
        file_vars: Optional[Dict[str, str]] = None,
        file_path: Optional[pathlib.Path] = None,
        create_dirs: bool = True,
        **path_args: Any,
    ) -> Optional[pathlib.Path]:
        """Download a file from the web and save it to disk

        Use pycurl (libcurl) to do the actual downloading. Requests might be
        nicer for this, but turned out to be much slower (and in practice
        unusable for bigger files) and also not really supporting
        ftp-downloads.

        Args:
            file_key:     File key that should be downloaded.
            file_vars:    File variables used to find path from file_key.
            file_path:    Path where file will be saved, default is to read from configuration.
            create_dirs:  Create directories as necessary before downloading file.
            path_args:    Arguments passed on to .path() to find file_path.

        Returns:
            Path to downloaded file, None if no file was downloaded.
        """
        # Do not download anything if download_missing class variable is False
        if not self.download_missing:
            return None

        # Do not download anything if url is not given in configuration
        if "url" not in self[file_key] or not self[file_key].url.str:
            return None

        # Get file_path from configuration if it's not given explicitly
        file_url = self.url(file_key, file_vars=file_vars, **path_args)
        is_zipped = self.is_path_zipped(file_url)
        path_args.update(is_zipped=is_zipped)

        if file_path is None:
            file_path = self.path(file_key, file_vars=file_vars, download_missing=False, **path_args)
        file_path = file_path.with_name(file_url.name)

        if create_dirs:
            file_path.parent.mkdir(parents=True, exist_ok=True)

        log.info(f"Download {file_key} from '{file_url}' to '{file_path}'")
        with builtins.open(file_path, mode="wb") as fid:
            c = pycurl.Curl()
            c.setopt(c.URL, file_url)
            c.setopt(c.WRITEDATA, fid)
            try:
                c.perform()
                if not (200 <= c.getinfo(c.HTTP_CODE) <= 299):
                    raise pycurl.error()
            except pycurl.error:
                log.error(f"Problem downloading file: {c.getinfo(c.EFFECTIVE_URL)} ({c.getinfo(c.HTTP_CODE)})")
                if file_path.exists():  # Print first 10 lines to console
                    head_of_file = f"Contents of '{file_path}':\n" + "\n".join(file_path.read_text().split("\n")[:10])
                    log.info(console.indent(head_of_file, num_spaces=8))
                    file_path.unlink()
                log.warn(f"Try to download '{file_url}' manually and save it at '{file_path}'")
            else:
                log.info(f"Done downloading {file_key}")
            finally:
                c.close()
        return file_path
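
A hypothetical call, assuming a configured instance of the class and a file key defined in its configuration; both names below are placeholders:

path = files.download_file("gravity_coefficients", create_dirs=True)
if path is not None:
    print(f"Downloaded file available at {path}")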
Exemple #31
0
def mock_curl_not_http_error(mock_curl):
    mock_curl.perform.side_effect = pycurl.error()
    return mock_curl
Exemple #32
0
 def _check_curl_errors(self):
     for f in self.curlmulti.info_read()[2]:
         raise pycurl.error(*f[1:])
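
For context: CurlMulti.info_read() returns a (num_queued, ok_list, err_list) triple, and each err_list entry is a (handle, errno, errmsg) tuple, so *f[1:] forwards exactly the two arguments pycurl.error expects. The same loop, unpacked explicitly (multi is an assumed CurlMulti instance):

num_queued, ok_list, err_list = multi.info_read()
for handle, errno, errmsg in err_list:
    raise pycurl.error(errno, errmsg)  # same as pycurl.error(*f[1:])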
Exemple #33
0
    def _download(self, chunks, resume):
        if not resume:
            self.info.clear()
            self.info.addChunk("%s.chunk0" % self.filename, (0, 0))  #create an initial entry

        self.chunks = []

        init = HTTPChunk(0, self, None, resume)  #initial chunk that will load complete file (if needed)

        self.chunks.append(init)
        self.m.add_handle(init.getHandle())

        lastFinishCheck = 0
        lastTimeCheck = 0
        chunksDone = set()  # list of curl handles that are finished
        chunksCreated = False
        done = False
        if self.info.getCount() > 1:  # This is a resume, if we were chunked originally assume still can
            self.chunkSupport = True

        while 1:
            #need to create chunks
            if not chunksCreated and self.chunkSupport and self.size:  #will be setted later by first chunk

                if not resume:
                    self.info.setSize(self.size)
                    self.info.createChunks(chunks)
                    self.info.save()

                chunks = self.info.getCount()

                init.setRange(self.info.getChunkRange(0))

                for i in range(1, chunks):
                    c = HTTPChunk(i, self, self.info.getChunkRange(i), resume)

                    handle = c.getHandle()
                    if handle:
                        self.chunks.append(c)
                        self.m.add_handle(handle)
                    else:
                        #close immediately
                        self.log.debug("Invalid curl handle -> closed")
                        c.close()

                chunksCreated = True

            while 1:
                ret, num_handles = self.m.perform()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break

            t = time()

            # reduce these calls
            while lastFinishCheck + 0.5 < t:
                # list of failed curl handles
                failed = []
                ex = None  # save only last exception, we can only raise one anyway

                num_q, ok_list, err_list = self.m.info_read()
                for c in ok_list:
                    chunk = self.findChunk(c)
                    try:  # check if the header implies success, else add it to failed list
                        chunk.verifyHeader()
                    except BadHeader, e:
                        self.log.debug("Chunk %d failed: %s" %
                                       (chunk.id + 1, str(e)))
                        failed.append(chunk)
                        ex = e
                    else:
                        chunksDone.add(c)

                for c in err_list:
                    curl, errno, msg = c
                    chunk = self.findChunk(curl)
                    #test if chunk was finished
                    if errno != 23 or "0 !=" not in msg:
                        failed.append(chunk)
                        ex = pycurl.error(errno, msg)
                        self.log.debug("Chunk %d failed: %s" %
                                       (chunk.id + 1, str(ex)))
                        continue

                    try:  # check if the header implies success, else add it to failed list
                        chunk.verifyHeader()
                    except BadHeader, e:
                        self.log.debug("Chunk %d failed: %s" %
                                       (chunk.id + 1, str(e)))
                        failed.append(chunk)
                        ex = e
                    else:
                        chunksDone.add(curl)
                if not num_q:  # no more infos to get

                    # check if init is not finished so we reset download connections
                    # note that other chunks are closed and downloaded with init too
                    if failed and init not in failed and init.c not in chunksDone:
                        self.log.error(
                            _("Download chunks failed, fallback to single connection | %s"
                              % (str(ex))))

                        #list of chunks to clean and remove
                        to_clean = filter(lambda x: x is not init, self.chunks)
                        for chunk in to_clean:
                            self.closeChunk(chunk)
                            self.chunks.remove(chunk)
                            remove(fs_encode(self.info.getChunkName(chunk.id)))

                        #let first chunk load the rest and update the info file
                        init.resetRange()
                        self.info.clear()
                        self.info.addChunk("%s.chunk0" % self.filename,
                                           (0, self.size))
                        self.info.save()
                    elif failed:
                        raise ex

                    lastFinishCheck = t

                    if len(chunksDone) >= len(self.chunks):
                        if len(chunksDone) > len(self.chunks):
                            self.log.warning(
                                "Finished download chunks size incorrect, please report bug."
                            )
                        done = True  #all chunks loaded

                    break
Exemple #34
0
    def iterate_results(self):
        while True:
            try:
                self.network_op_lock.acquire()
                with self.sigint_handler.handle_sigint():
                    queued_messages, ok_list, fail_list = (
                        self.multi.info_read())
            finally:
                self.network_op_lock.release()
            #except Exception as ex:
            #    # Usually that should not happen
            #    logging.error('', exc_info=ex)
            #    continue

            results = []
            for curl in ok_list:
                results.append((True, curl, None, None, None))
            for curl, ecode, emsg in fail_list:
                curl.grab_callback_interrupted = False
                try:
                    raise pycurl.error(ecode, emsg)
                except Exception as exc:  # pylint: disable=broad-except
                    grab_exc = build_grab_exception(exc, curl)
                # grab_exc could be None if the pycurl error
                # was expected (could be in case of
                # body_maxsize and other options)
                if grab_exc:
                    results.append((False, curl, ecode, emsg, grab_exc))
                else:
                    results.append((True, curl, None, None, None))

            for is_ok, curl, ecode, emsg, grab_exc in results:
                # FORMAT: {is_ok, grab, grab_config_backup, task,
                #          ecode, emsg, error_abbr, exc}

                curl_id = id(curl)
                task = self.registry[curl_id]['task']
                grab = self.registry[curl_id]['grab']
                grab_config_backup = self.registry[curl_id]['grab_config_backup']

                try:
                    self.network_op_lock.acquire()
                    grab.process_request_result()
                except GrabTooManyRedirectsError:
                    ecode = ERROR_TOO_MANY_REDIRECTS
                    emsg = 'Too many meta refresh redirects'
                    is_ok = False
                finally:
                    self.network_op_lock.release()
                #except Exception as ex:
                #    logging.error('', exc_info=ex)
                #    ecode = ERROR_INTERNAL_GRAB_ERROR
                #    emsg = 'Internal grab error'
                #    is_ok = False

                grab.doc.error_code = ecode
                grab.doc.error_msg = emsg
                grab.exception = grab_exc

                # Free resources
                del self.registry[curl_id]
                grab.transport.curl = None

                if is_ok:
                    error_abbr = None
                else:
                    error_abbr = ERRNUM_TAG.get(ecode, 'unknown-%d' % ecode)
                yield {
                    'ok': is_ok,
                    'ecode': ecode,
                    'emsg': emsg,
                    'error_abbr': error_abbr,
                    'exc': grab_exc,
                    'grab': grab,
                    'grab_config_backup': grab_config_backup,
                }, task

                try:
                    self.network_op_lock.acquire()
                    with self.sigint_handler.handle_sigint():
                        self.multi.remove_handle(curl)
                finally:
                    self.network_op_lock.release()

                curl.reset()
                self.freelist.append(curl)

            if not queued_messages:
                break
Exemple #35
0
 def _check_curl_errors(self):
     for f in self.curl_multi.info_read()[2]:
         raise pycurl.error(*f[1:])
Exemple #36
0
    def _download(self, chunks, resume):
        if not resume:
            self.info.clear()
            # Create an initial entry
            self.info.addChunk("%s.chunk0" % self.filename, (0, 0))

        self.chunks = []

        # Initial chunk that will load complete file (if needed)
        init = HTTPChunk(0, self, None, resume)

        self.chunks.append(init)
        self.m.add_handle(init.getHandle())

        lastFinishCheck = 0
        lastTimeCheck = 0
        # List of curl handles that are finished
        chunksDone = set()
        chunksCreated = False
        done = False

        # This is a resume, if we were chunked originally assume still can
        if self.info.getCount() > 1:
            self.chunkSupport = True

        while 1:
            # Need to create chunks
            # Will be set later by first chunk
            if not chunksCreated and self.chunkSupport and self.size:
                if not resume:
                    self.info.setSize(self.size)
                    self.info.createChunks(chunks)
                    self.info.save()

                chunks = self.info.getCount()

                init.setRange(self.info.getChunkRange(0))

                for i in range(1, chunks):
                    c = HTTPChunk(i, self, self.info.getChunkRange(i), resume)

                    handle = c.getHandle()
                    if handle:
                        self.chunks.append(c)
                        self.m.add_handle(handle)
                    else:
                        # Close immediately
                        self.log.debug("Invalid curl handle -> closed")
                        c.close()

                chunksCreated = True

            while 1:
                ret, num_handles = self.m.perform()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break

            t = time()

            # Reduce these calls
            while lastFinishCheck + 0.5 < t:
                # List of failed curl handles
                failed = []
                # Save only last exception, we can only raise one anyway
                ex = None

                num_q, ok_list, err_list = self.m.info_read()
                for c in ok_list:
                    chunk = self.findChunk(c)
                    # Check if the header implies success, else add it to failed list
                    try:
                        chunk.verifyHeader()
                    except BadHeader as e:
                        self.log.debug("Chunk %d failed: %s" %
                                       (chunk.id + 1, str(e)))
                        failed.append(chunk)
                        ex = e
                    else:
                        chunksDone.add(c)

                for c in err_list:
                    curl, errno, msg = c
                    chunk = self.findChunk(curl)
                    # Test if chunk was finished
                    if errno != 23 or "0 !=" not in msg:
                        failed.append(chunk)
                        ex = pycurl.error(errno, msg)
                        self.log.debug("Chunk %d failed: %s" %
                                       (chunk.id + 1, str(ex)))
                        continue
                    # Check if the header implies success, else add it to failed list
                    try:
                        chunk.verifyHeader()
                    except BadHeader as e:
                        self.log.debug("Chunk %d failed: %s" %
                                       (chunk.id + 1, str(e)))
                        failed.append(chunk)
                        ex = e
                    else:
                        chunksDone.add(curl)
                # No more infos to get
                if not num_q:

                    # Check if init is not finished so we reset download connections,
                    # note that other chunks are closed and downloaded with init too
                    if failed and init not in failed and init.c not in chunksDone:
                        self.log.error(
                            _("Download chunks failed, fallback to single connection | %s"
                              % (str(ex))))

                        # List of chunks to clean and remove
                        for chunk in filter(lambda x: x is not init,
                                            self.chunks):
                            self.closeChunk(chunk)
                            self.chunks.remove(chunk)
                            remove(fs_encode(self.info.getChunkName(chunk.id)))

                        # Let first chunk load the rest and update the info file
                        init.resetRange()
                        self.info.clear()
                        self.info.addChunk("%s.chunk0" % self.filename,
                                           (0, self.size))
                        self.info.save()
                    elif failed:
                        raise ex

                    lastFinishCheck = t

                    if len(chunksDone) >= len(self.chunks):
                        if len(chunksDone) > len(self.chunks):
                            self.log.warning(
                                "Finished download chunks size incorrect, please report bug."
                            )
                        # All chunks loaded
                        done = True

                    break
            # All chunks loaded
            if done:
                break

            # Calc speed once per second, averaging over 3 seconds
            if lastTimeCheck + 1 < t:
                diff = [
                    c.arrived -
                    (self.lastArrived[i] if len(self.lastArrived) > i else 0)
                    for i, c in enumerate(self.chunks)
                ]

                self.lastSpeeds[1] = self.lastSpeeds[0]
                self.lastSpeeds[0] = self.speeds
                self.speeds = [float(a) / (t - lastTimeCheck) for a in diff]
                self.lastArrived = [c.arrived for c in self.chunks]
                lastTimeCheck = t
                self.updateProgress()

            if self.abort:
                raise Abort()

            # sleep(0.003) would suppress busy waiting, but limits dl speed to (1 / x) * buffersize
            self.m.select(1)

        for chunk in self.chunks:
            # Make sure downloads are written to disk
            chunk.flushFile()

        self._copyChunks()
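
A note on the errno 23 special case above: 23 is CURLE_WRITE_ERROR, which libcurl reports when a write callback aborts the transfer; that is how a chunk that has already received its full byte range stops itself, so a 23 paired with a "0 !=" message is treated as completion rather than failure. A simplified restatement of that branch (header verification omitted for brevity):

for curl, errno, msg in err_list:
    chunk = self.findChunk(curl)
    if errno == 23 and "0 !=" in msg:
        chunksDone.add(curl)   # deliberate CURLE_WRITE_ERROR from a finished chunk
    else:
        failed.append(chunk)
        ex = pycurl.error(errno, msg)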
Exemple #38
0
    def execute(self, method, url, data=None, headers={}):
        import pycurl
        host = self.get_host_port_from_url(url)
        if host in self.curl_session:
            curl = self.curl_session[host]
        else:
            self.curl_session[host] = pycurl.Curl()
            curl = self.curl_session[host]

        url = url.replace(" ", "%20")

        method = method.upper()

        self.server_headers = dict()

        buffer = BytesIO()

        curl.setopt(curl.URL, nfw.utils.if_unicode_to_utf8(url))
        try:
            curl.setopt(curl.WRITEDATA, buffer)
        except TypeError:
            curl.setopt(curl.WRITEFUNCTION, buffer.write)
        curl.setopt(curl.HEADERFUNCTION, self.header_function)
        curl.setopt(curl.FOLLOWLOCATION, True)
        curl.setopt(curl.SSL_VERIFYPEER, self.ssl_verify_peer)
        curl.setopt(curl.SSL_VERIFYHOST, self.ssl_verify_host)
        curl.setopt(curl.CONNECTTIMEOUT, self.connect_timeout)
        curl.setopt(curl.TIMEOUT, self.timeout)
        curl.setopt(curl.DEBUGFUNCTION, _debug)
        curl.setopt(curl.VERBOSE, 1)

        if data is not None:
            curl.setopt(curl.POSTFIELDS, nfw.utils.if_unicode_to_utf8(data))
        else:
            curl.setopt(curl.POSTFIELDS, nfw.utils.if_unicode_to_utf8(''))

        send_headers = list()
        for header in headers:
            send_header = nfw.utils.if_unicode_to_utf8(
                "%s: %s" % (header, headers[header]))
            send_headers.append(send_header)

        curl.setopt(pycurl.HTTPHEADER, send_headers)

        # map the supported nfw method constants to their literal verbs
        methods = {
            nfw.HTTP_GET: 'GET',
            nfw.HTTP_PUT: 'PUT',
            nfw.HTTP_POST: 'POST',
            nfw.HTTP_PATCH: 'PATCH',
            nfw.HTTP_DELETE: 'DELETE',
            nfw.HTTP_OPTIONS: 'OPTIONS',
            nfw.HTTP_HEAD: 'HEAD',
            nfw.HTTP_TRACE: 'TRACE',
            nfw.HTTP_CONNECT: 'CONNECT',
        }
        if method not in methods:
            raise nfw.Error("Invalid request type %s" % (method,))
        curl.setopt(curl.CUSTOMREQUEST,
                    nfw.utils.if_unicode_to_utf8(methods[method]))

        try:
            curl.perform()
            status = curl.getinfo(pycurl.HTTP_CODE)
        except pycurl.error as e:
            del self.curl_session[host]
            if e.args[0] == 28:  # CURLE_OPERATION_TIMEDOUT
                raise nfw.RestClientError("Connection timeout %s" % (host,))
            else:
                raise

        # Figure out what encoding was sent with the response, if any.
        # Check against lowercased header name.
        encoding = None
        if 'content-type' in self.server_headers:
            content_type = self.server_headers['content-type'].lower()
            match = re.search(r'charset=(\S+)', content_type)
            if match:
                encoding = match.group(1)
        if encoding is None:
            # Default encoding for JSON is UTF-8.
            # Other content types may have different default encoding,
            # or in case of binary data, may have no encoding at all.
            encoding = 'utf_8'

        body = buffer.getvalue()
        # Decode using the encoding we figured out.
        body = body.decode(encoding)
        resp_header = nfw.Headers()
        for h in self.server_headers:
            resp_header[h] = self.server_headers[h]
        return (status, resp_header, body)
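
A hypothetical call against the client above, assuming the surrounding nfw framework; the instance and URL are placeholders:

status, resp_headers, body = client.execute(
    nfw.HTTP_GET,
    "https://api.example.com/v1/items",
    headers={"Accept": "application/json"},
)
if status == 200:
    print(body)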