Ejemplo n.º 1
0
    def _process_redirect(self):
        '''Advance the redirect tracker and build the follow-up request.'''
        _logger.debug('Handling redirect.')

        if self._redirect_tracker.exceeded():
            raise ProtocolError('Too many redirects.')

        try:
            location = self._redirect_tracker.next_location()

            if not location:
                raise ProtocolError('Redirect location missing.')

            if self._redirect_tracker.is_repeat():
                # Repeat redirect: re-issue the original request at the
                # new location instead of building a fresh request.
                _logger.debug('Got redirect is repeat.')

                next_request = self._original_request.copy()
                next_request.url = location
            else:
                next_request = self._web_client.request_factory(location)

            next_request.prepare_for_send()
        except ValueError as error:
            raise ProtocolError('Invalid redirect location.') from error

        self._next_request = next_request

        _logger.debug('Updated next redirect request to {0}.'.format(
            next_request))
Ejemplo n.º 2
0
    def read_chunk_header(self):
        '''Read a single chunk's header.

        Returns:
            tuple: 2-item tuple with the size of the content in the chunk and
            the raw header byte string.

        Coroutine.
        '''
        try:
            header_line = yield from self._connection.readline()
        except ValueError as error:
            raise ProtocolError(
                'Invalid chunk size: {0}'.format(error)) from error

        if not header_line.endswith(b'\n'):
            raise NetworkError('Connection closed.')

        # The size field may be followed by chunk extensions after ';'.
        size_field = header_line.split(b';', 1)[0].strip()

        try:
            chunk_size = int(size_field, 16)
        except ValueError as error:
            raise ProtocolError(
                'Invalid chunk size: {0}'.format(error)) from error

        if chunk_size < 0:
            raise ProtocolError('Chunk size cannot be negative.')

        self._chunk_size = self._bytes_left = chunk_size

        return chunk_size, header_line
Ejemplo n.º 3
0
    def _update_redirect_request(self):
        '''Advance the redirect tracker and prepare the next request.'''
        _logger.debug('Handling redirect.')

        if self._redirect_tracker.exceeded():
            raise ProtocolError('Too many redirects.')

        location = self._redirect_tracker.next_location()

        if not location:
            raise ProtocolError('Redirect location missing.')

        try:
            next_request = self._rich_client.request_factory(location)
        except ValueError as error:
            raise ProtocolError('Invalid redirect location.') from error

        if self._redirect_tracker.is_repeat():
            _logger.debug('Got redirect is repeat.')

            # Re-send the original request (method, body, and any header
            # fields the new request does not already define) at the new
            # location.
            next_request.method = self._original_request.method
            next_request.body = self._original_request.body

            for name, value in self._original_request.fields.items():
                if name not in next_request.fields:
                    next_request.fields.add(name, value)

        self._next_request = next_request

        _logger.debug('Updated next redirect request to {0}.'.format(
            next_request))
Ejemplo n.º 4
0
 def _decompress_data(self, data):
     '''Decompress the given data and return the uncompressed data.'''
     if not self._decompressor:
         # No decompressor configured: pass the data through unchanged.
         return data

     try:
         return self._decompressor.decompress(data)
     except zlib.error as error:
         raise ProtocolError(
             'zlib error: {0}.'.format(error)) from error
Ejemplo n.º 5
0
 def _flush_decompressor(self):
     '''Return any data left in the decompressor.'''
     if not self._decompressor:
         # Nothing to flush when decompression is disabled.
         return b''

     try:
         return self._decompressor.flush()
     except zlib.error as error:
         raise ProtocolError(
             'zlib flush error: {0}.'.format(error)) from error
Ejemplo n.º 6
0
    def read_chunk(self):
        '''Read a single chunk of the chunked transfer encoding.

        Fires ``data_event`` for every raw byte string read (the chunk-size
        line, the body fragments, and the trailing newline) and
        ``content_event`` for the chunk body bytes only.

        Returns:
            int: The size of the content in the chunk.
        '''
        _logger.debug('Reading chunk.')
        chunk_size_hex = yield self._io_stream.read_until(b'\n')

        self.data_event.fire(chunk_size_hex)

        try:
            # The size field may be followed by chunk extensions after ';'.
            chunk_size = int(chunk_size_hex.split(b';', 1)[0].strip(), 16)
        except ValueError as error:
            raise ProtocolError(error.args[0]) from error

        _logger.debug('Getting chunk size={0}.'.format(chunk_size))

        if not chunk_size:
            # Size 0 marks the terminating chunk; nothing more to read here.
            raise tornado.gen.Return(chunk_size)

        data_queue = self._io_stream.read_bytes_queue(chunk_size)

        while True:
            data = yield data_queue.get()

            if data is None:
                # Queue exhausted: the whole chunk body has been read.
                break

            self.data_event.fire(data)
            self.content_event.fire(data)

        # Consume the newline that terminates the chunk body.
        newline_data = yield self._io_stream.read_until(b'\n')

        self.data_event.fire(newline_data)

        if len(newline_data) > 2:
            # Should be either CRLF or LF
            # This could be our problem or the server's problem
            raise ProtocolError('Error reading newline after chunk.')

        raise tornado.gen.Return(chunk_size)
Ejemplo n.º 7
0
    def parse(self, data):
        '''Split a raw command line into a name and an argument string.'''
        assert self.name is None
        assert not self.argument

        match = re.match(br'(\w+) ?([^\r\n]*)', data)

        if not match:
            raise ProtocolError('Failed to parse command.')

        name_bytes, argument_bytes = match.groups()

        self.name = name_bytes.decode('utf-8', errors='surrogateescape')
        self.argument = argument_bytes.decode(
            'utf-8', errors='surrogateescape')
Ejemplo n.º 8
0
    def read_listing_content(self, file, duration_timeout=None):
        '''Read file listings.

        Args:
            file: Destination for the raw listing content.
            duration_timeout: Maximum time in seconds for reading the
                content.  # NOTE(review): forwarded to read_content -- confirm semantics there

        Returns:
            .ftp.request.ListingResponse: A Response populated with the
            file listings

        Be sure to call :meth:`fetch_file_listing` first.

        Coroutine.
        '''
        yield From(
            self.read_content(file=file,
                              rewind=False,
                              duration_timeout=duration_timeout))

        try:
            if self._response.body.tell() == 0:
                # Empty body: no files listed.
                listings = ()
            elif self._listing_type == 'mlsd':
                self._response.body.seek(0)

                # Machine-readable (MLSD) listing.
                machine_listings = wpull.ftp.util.parse_machine_listing(
                    self._response.body.read().decode(
                        'utf-8', errors='surrogateescape'),
                    convert=True,
                    strict=False)
                listings = list(
                    wpull.ftp.util.machine_listings_to_file_entries(
                        machine_listings))
            else:
                self._response.body.seek(0)

                # Human-readable (LIST-style) output: run heuristics to
                # detect the listing dialect before parsing.
                file = io.TextIOWrapper(self._response.body,
                                        encoding='utf-8',
                                        errors='surrogateescape')

                listing_parser = ListingParser(file=file)
                heuristics_result = listing_parser.run_heuristics()

                _logger.debug('Listing detected as %s', heuristics_result)

                listings = listing_parser.parse()

                # We don't want the file to be closed when exiting this function
                file.detach()

        except (ListingError, ValueError) as error:
            raise ProtocolError(*error.args) from error

        self._response.files = listings

        self._response.body.seek(0)
        raise Return(self._response)
Ejemplo n.º 9
0
    def read_response(self, response=None):
        '''Read the response's HTTP status line and header fields.

        Coroutine.
        '''
        _logger.debug('Reading header.')

        if response is None:
            response = Response()

        header_lines = []
        bytes_read = 0

        while True:
            try:
                line = yield From(self._connection.readline())
            except ValueError as error:
                raise ProtocolError(
                    'Invalid header: {0}'.format(error)) from error

            self._data_observer.notify('response', line)

            if not line.endswith(b'\n'):
                raise NetworkError('Connection closed.')

            if line in (b'\r\n', b'\n'):
                # A blank line terminates the header section.
                break

            header_lines.append(line)
            bytes_read += len(line)

            # Refuse pathologically large headers.
            if bytes_read > 32768:
                raise ProtocolError('Header too big.')

        if not header_lines:
            raise ProtocolError('No header received.')

        response.parse(b''.join(header_lines))

        raise Return(response)
Ejemplo n.º 10
0
    def read_chunk_body(self):
        '''Read a fragment of a single chunk.

        Call :meth:`read_chunk_header` first.

        Returns:
            tuple: 2-item tuple with the content data and raw data.
            First item is empty bytes string when chunk is fully read.

        Coroutine.
        '''
        bytes_left = self._bytes_left

        if bytes_left > 0:
            # Read at most one buffer's worth of the remaining chunk body.
            size = min(bytes_left, self._read_size)
            data = yield from self._connection.read(size)

            self._bytes_left -= len(data)

            return (data, data)
        elif bytes_left < 0:
            raise ProtocolError('Chunked-transfer overrun.')

        # bytes_left == 0: the chunk body is fully read.  (A former
        # `elif bytes_left:` branch here was unreachable -- after the
        # > 0 and < 0 checks only 0 remains, which is falsy -- and has
        # been removed.)  Consume the newline that terminates the chunk.
        newline_data = yield from self._connection.readline()

        if len(newline_data) > 2:
            # Should be either CRLF or LF.
            # This could be our problem or the server's problem.
            raise ProtocolError('Error reading newline after chunk.')

        self._chunk_size = self._bytes_left = None

        return (b'', newline_data)
Ejemplo n.º 11
0
        def response_callback(request):
            # Simulate a server that always redirects to robots.txt; fail
            # with a protocol error if the client loops more than 20 times.
            request.prepare_for_send()
            self.assertTrue(request.url_info.url.endswith('robots.txt'))

            redirect_response = Response(302, 'See else')
            redirect_response.request = request
            redirect_response.fields['Location'] = '/robots.txt'

            nonlocal_dict['counter'] += 1

            if nonlocal_dict['counter'] > 20:
                raise ProtocolError('Mock redirect loop error.')

            return redirect_response
Ejemplo n.º 12
0
    def parse_status_line(cls, string):
        '''Parse the status line bytes.

        Returns:
            tuple: A tuple representing the version, code, and reason.
        '''
        match = re.match(br'(HTTP/1\.[01])[ \t]+([0-9]{1,3})[ \t]*([^\r\n]*)',
                         string)

        if not match:
            raise ProtocolError(
                "Error parsing status line '{0}'".format(string))

        version, code, reason = match.groups()

        return wpull.string.to_str(
            (version, int(code), reason),
            encoding='latin-1',
        )
Ejemplo n.º 13
0
    def parse(self, data):
        '''Parse reply lines, accumulating the code and text.'''
        for line in data.splitlines(False):
            match = re.match(br'(\d{3}|^)([ -]?)(.*)', line)

            if not match:
                raise ProtocolError('Failed to parse reply.')

            code_bytes, separator, text_bytes = match.groups()

            if code_bytes and separator == b' ':
                # A space after the code marks the final line of the reply.
                assert self.code is None
                self.code = int(code_bytes)

            text = text_bytes.decode('utf-8', errors='surrogateescape')

            if self.text is None:
                self.text = text
            else:
                self.text += '\r\n{0}'.format(text)
Ejemplo n.º 14
0
    def parse_status_line(cls, data):
        '''Parse the status line bytes.

        Returns:
            tuple: A tuple representing the version, code, and reason.

        Raises:
            ProtocolError: if the line does not match an HTTP status line.
        '''
        match = re.match(br'(HTTP/\d+\.\d+)[ \t]+([0-9]{1,3})[ \t]*([^\r\n]*)',
                         data)
        if match:
            groups = match.groups()
            if len(groups) == 3:
                return wpull.string.to_str(
                    (groups[0], int(groups[1]), groups[2]),
                    encoding='latin-1',
                )

        # ascii() keeps the offending line printable even when it contains
        # control or non-ASCII bytes.  (Fixed: the message previously had
        # a stray unbalanced quote after the placeholder.)
        raise ProtocolError(
            'Error parsing status line "{line}".'.format(line=ascii(data)))
Ejemplo n.º 15
0
    def parse_status_line(cls, string):
        '''Parse the status line bytes.

        Returns:
            tuple: A tuple representing the method, resource path, and
            version.
        '''
        match = re.match(br'([a-zA-Z]+)[ \t]+([^ \t]+)[ \t]+(HTTP/1\.[01])',
                         string)

        if not match:
            raise ProtocolError(
                'Error parsing status line ‘{0}’'.format(string))

        method, resource_path, version = match.groups()

        return wpull.string.to_str(
            (method, resource_path, version),
            encoding='latin-1',
        )
Ejemplo n.º 16
0
    def parse_status_line(self, data):
        '''Parse the status line bytes.

        Returns:
            tuple: A tuple representing the method, URI, and
            version.
        '''
        match = re.match(br'([a-zA-Z]+)[ \t]+([^ \t]+)[ \t]+(HTTP/\d+\.\d+)',
                         data)

        if not match:
            raise ProtocolError('Error parsing status line.')

        method, uri, version = match.groups()

        return wpull.string.to_str(
            (method, uri, version),
            encoding=self.encoding,
        )
Ejemplo n.º 17
0
    def passive_mode(self):
        '''Enable passive mode.

        Returns:
            tuple: The address (IP address, port) of the passive port.

        Coroutine.
        '''
        yield From(self._control_stream.write_command(Command('PASV')))

        reply = yield From(self._control_stream.read_reply())

        self.raise_if_not_match('Passive mode',
                                ReplyCodes.entering_passive_mode, reply)

        try:
            # parse_address raises ValueError on a malformed PASV reply.
            address = wpull.ftp.util.parse_address(reply.text)
        except ValueError as error:
            raise ProtocolError(str(error)) from error

        raise Return(address)
Ejemplo n.º 18
0
    def _process_request(self, request, response_factory):
        '''Fulfill a single request.

        Args:
            request: The request to send.
            response_factory: Factory passed through to
                ``_read_response_header``.

        Returns:
            Response
        '''
        yield self._connect()

        request.address = self._resolved_address
        self._events.pre_request(request)

        # ConnectionError does not exist before Python 3.3; fall back to
        # the socket-level error there.
        if sys.version_info < (3, 3):
            error_class = (socket.error, StreamClosedError, ssl.SSLError)
        else:
            error_class = (ConnectionError, StreamClosedError, ssl.SSLError)

        if not self._params.keep_alive and 'Connection' not in request.fields:
            request.fields['Connection'] = 'close'

        try:
            yield self._send_request_header(request)
            yield self._send_request_body(request)
            self._events.request.fire(request)

            response = yield self._read_response_header(response_factory)
            # TODO: handle 100 Continue

            yield self._read_response_body(request, response)
        except error_class as error:
            # Low-level socket/SSL failures become NetworkError...
            raise NetworkError('Network error: {0}'.format(error)) from error
        except BufferFullError as error:
            # ...while an overfull read buffer is a protocol violation.
            raise ProtocolError(*error.args) from error

        self._events.response.fire(response)

        if self.should_close(request.version,
                             response.fields.get('Connection')):
            _logger.debug('HTTP connection close.')
            self.close()
        else:
            # Keep-alive: watch the idle connection for a remote close.
            self._io_stream.monitor_for_close()

        raise tornado.gen.Return(response)
Ejemplo n.º 19
0
    def _read_request_header(self):
        '''Read and parse an incoming request's header lines.  Coroutine.'''
        request = Request()

        # Cap the header at 100 lines to avoid unbounded reads.
        for _ in range(100):
            line = yield From(self._reader.readline())

            _logger.debug(__('Got line {0}', line))

            if line[-1:] != b'\n':
                # Incomplete line: the connection closed mid-header.
                return

            if not line.strip():
                # Blank line terminates the header section.
                break

            request.parse(line)
        else:
            raise ProtocolError('Request has too many headers.')

        raise Return(request)
Ejemplo n.º 20
0
    def handle_error(self, item_session: ItemSession,
                     error: BaseException) -> Actions:
        '''Process an error.

        Returns:
            A value from :class:`.hook.Actions`.
        '''
        ssl_error = isinstance(error, SSLVerificationError)

        if ssl_error and not self._ssl_verification:
            # The user opted out of certificate checks, so record a
            # generic protocol error instead of the SSL failure.
            self._statistics.increment_error(ProtocolError())
        else:
            self._statistics.increment_error(error)

        self._waiter.increment()

        action = self.consult_error_hook(item_session, error)

        if action == Actions.RETRY:
            item_session.set_status(Status.error)
        elif action == Actions.FINISH:
            item_session.set_status(Status.done)
        elif action == Actions.STOP:
            raise HookStop('Script requested immediate stop.')
        elif ssl_error and self._ssl_verification:
            # Re-raise the SSL verification failure being handled.
            raise
        elif isinstance(error, ConnectionRefused) \
                and not self.retry_connrefused:
            item_session.set_status(Status.skipped)
        elif isinstance(error, DNSNotFound) and not self.retry_dns_error:
            item_session.set_status(Status.skipped)
        else:
            item_session.set_status(Status.error)

        return action
Ejemplo n.º 21
0
    def _stream_closed_callback(self):
        '''Handle the underlying IO stream being closed.

        Raises the stream's stored error, or :class:`ProtocolError` when
        the read buffer overflowed.
        '''
        # Fixed: 'writing' previously reused format index {3}, so the log
        # showed the 'reading' flag twice; it now uses {4}.
        _logger.debug(
            'Stream closed. '
            'active={0} connected={1} '
            'closed={2} reading={3} writing={4}'.format(
                self._active,
                self.connected,
                self._io_stream.closed(),
                self._io_stream.reading(),
                self._io_stream.writing())
        )

        if not self._active:
            # We are likely in a context that's already dead
            _logger.debug('Ignoring stream closed error={0}.'.format(
                self._io_stream.error))
            return

        if self._io_stream.error:
            _logger.debug('Throwing error {0}.'.format(self._io_stream.error))
            raise self._io_stream.error

        if self._io_stream.buffer_full:
            _logger.debug('Buffer full.')
            raise ProtocolError('Buffer full.')
Ejemplo n.º 22
0
    def download_listing(self, file: Optional[IO],
                         duration_timeout: Optional[float]=None) -> \
            ListingResponse:
        '''Read file listings.

        Args:
            file: A file object or asyncio stream.
            duration_timeout: Maximum time in seconds of which the
                entire file must be read.

        Returns:
            A Response populated the file listings

        Be sure to call :meth:`start_file_listing` first.

        Coroutine.
        '''
        if self._session_state != SessionState.directory_request_sent:
            raise RuntimeError('File request not sent')

        self._session_state = SessionState.file_request_sent

        yield from self.download(file=file,
                                 rewind=False,
                                 duration_timeout=duration_timeout)

        try:
            if self._response.body.tell() == 0:
                # Empty body: no files listed.
                listings = ()
            elif self._listing_type == 'mlsd':
                self._response.body.seek(0)

                # Machine-readable (MLSD) listing.
                machine_listings = wpull.protocol.ftp.util.parse_machine_listing(
                    self._response.body.read().decode(
                        'utf-8', errors='surrogateescape'),
                    convert=True,
                    strict=False)
                listings = list(
                    wpull.protocol.ftp.util.machine_listings_to_file_entries(
                        machine_listings))
            else:
                self._response.body.seek(0)

                # Human-readable (LIST-style) output: wrap the body for
                # text decoding before parsing.
                file = io.TextIOWrapper(self._response.body,
                                        encoding='utf-8',
                                        errors='surrogateescape')

                listing_parser = ListingParser(file=file)

                listings = list(listing_parser.parse_input())

                _logger.debug('Listing detected as %s', listing_parser.type)

                # We don't want the file to be closed when exiting this function
                file.detach()

        except (ListingError, ValueError) as error:
            raise ProtocolError(*error.args) from error

        self._response.files = listings

        self._response.body.seek(0)

        self._session_state = SessionState.response_received

        return self._response