Beispiel #1
0
def _read(io):
    """Try to read from a url, file or string.

    Parameters
    ----------
    io : str, unicode, or file-like
        A URL (as judged by ``_is_url``), an object with a ``read`` method,
        the path of an existing file, or the raw text itself.

    Returns
    -------
    raw_text : str
        The data read from the URL, file-like object or file, or ``io``
        itself when it is a string that does not name an existing file.

    Raises
    ------
    ValueError
        * If ``io`` is a URL and opening it raises ``urllib2.URLError``.
        * If ``io`` is not a URL, not file-like and not a string.
    """
    if _is_url(io):
        try:
            with contextlib.closing(urllib2.urlopen(io)) as url:
                raw_text = url.read()
        except urllib2.URLError:
            raise ValueError('Invalid URL: "{0}"'.format(io))
    elif hasattr(io, 'read'):
        raw_text = io.read()
    elif isinstance(io, basestring):
        # Only strings are handed to os.path.isfile: passing it arbitrary
        # objects raises TypeError and would hide the ValueError below.
        try:
            is_file = os.path.isfile(io)
        except (TypeError, ValueError):
            # a string that cannot be a path at all (e.g. it contains a NUL
            # byte) is treated as raw text
            is_file = False

        if is_file:
            with open(io) as f:
                raw_text = f.read()
        else:
            raw_text = io
    else:
        raise ValueError("Cannot read object of type '{0}'".format(type(io)))
    return raw_text
Beispiel #2
0
    def _build_doc(self):
        """Parse ``self.io`` with lxml and return the parsed document.

        ``lxml.html.parse`` is tried first. If it raises
        ``UnicodeDecodeError`` or ``IOError`` and ``self.io`` is not a URL
        (per ``_is_url``), the input is parsed with ``lxml.html.fromstring``
        instead.

        Raises
        ------
        IOError
            * If a valid URL is detected, but for some reason cannot be parsed.
              This is probably due to a faulty or non-existent internet
              connection.
        ValueError
            * If a URL that lxml cannot parse is passed.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.html import parse, fromstring

        try:
            # happy path: let lxml work out what kind of input this is
            return parse(self.io)
        except (UnicodeDecodeError, IOError):
            # anything that is not a url is assumed to be raw markup
            if not _is_url(self.io):
                return fromstring(self.io)

            supported = ('http', 'ftp', 'file')
            if urlparse.urlparse(self.io).scheme in supported:
                # the scheme is fine, so the failure is blamed on the
                # connection
                raise IOError('"{0}" is a valid URL, so you probably are not'
                              ' properly connected to the'
                              ' internet'.format(self.io))

            raise ValueError('"{0}" does not have a valid URL'
                             ' protocol'.format(self.io))
Beispiel #3
0
def _read(io):
    """Try to read from a url, file or string.

    Parameters
    ----------
    io : str, unicode, or file-like
        A URL (as judged by ``_is_url``), an object with a ``read`` method,
        the path of an existing file, or the raw text itself.

    Returns
    -------
    raw_text : str
        The data read from the URL, file-like object or file, or ``io``
        itself when it is a string that does not name an existing file.

    Raises
    ------
    ValueError
        * If ``io`` is a URL and opening it raises ``urllib2.URLError``.
        * If ``io`` is not a URL, not file-like and not a string.
    """
    if _is_url(io):
        try:
            with contextlib.closing(urllib2.urlopen(io)) as url:
                raw_text = url.read()
        except urllib2.URLError:
            raise ValueError('Invalid URL: "{0}"'.format(io))
    elif hasattr(io, 'read'):
        raw_text = io.read()
    elif isinstance(io, basestring):
        # Only strings are handed to os.path.isfile: passing it arbitrary
        # objects raises TypeError and would hide the ValueError below.
        try:
            is_file = os.path.isfile(io)
        except (TypeError, ValueError):
            # a string that cannot be a path at all (e.g. it contains a NUL
            # byte) is treated as raw text
            is_file = False

        if is_file:
            with open(io) as f:
                raw_text = f.read()
        else:
            raw_text = io
    else:
        raise ValueError("Cannot read object of type '{0}'".format(type(io)))
    return raw_text
Beispiel #4
0
    def _build_doc(self):
        """Parse ``self.io`` with lxml and return the parsed document.

        ``lxml.html.parse`` is tried first. If it raises
        ``UnicodeDecodeError`` or ``IOError`` and ``self.io`` is not a URL
        (per ``_is_url``), the input is parsed with ``lxml.html.fromstring``
        instead.

        Raises
        ------
        IOError
            * If a valid URL is detected, but for some reason cannot be parsed.
              This is probably due to a faulty or non-existent internet
              connection.
        ValueError
            * If a URL that lxml cannot parse is passed.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.html import parse, fromstring

        try:
            # happy path: let lxml work out what kind of input this is
            return parse(self.io)
        except (UnicodeDecodeError, IOError):
            # anything that is not a url is assumed to be raw markup
            if not _is_url(self.io):
                return fromstring(self.io)

            supported = ('http', 'ftp', 'file')
            if urlparse.urlparse(self.io).scheme in supported:
                # the scheme is fine, so the failure is blamed on the
                # connection
                raise IOError('"{0}" is a valid URL, so you probably are not'
                              ' properly connected to the'
                              ' internet'.format(self.io))

            raise ValueError('"{0}" does not have a valid URL'
                             ' protocol'.format(self.io))
Beispiel #5
0
    def _build_doc(self):
        """Parse ``self.io`` with a non-recovering lxml HTML parser.

        ``lxml.html.parse`` is tried first. If it raises
        ``UnicodeDecodeError`` or ``IOError`` and ``self.io`` is not a URL
        (per ``_is_url``), the input is parsed with ``lxml.html.fromstring``
        instead. Whenever the parse result has a ``getroot`` method, the
        value it returns is used in place of the result itself.

        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        XMLSyntaxError
            * If ``parse`` succeeds but the result has no ``text_content``
              attribute.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.html import parse, fromstring, HTMLParser
        from lxml.etree import XMLSyntaxError

        def _root_or_self(parsed):
            # prefer the root when the result exposes one, otherwise keep
            # the result untouched
            try:
                return parsed.getroot()
            except AttributeError:
                return parsed

        strict_parser = HTMLParser(recover=False)

        try:
            # first attempt: hand the input straight to lxml
            r = _root_or_self(parse(self.io, parser=strict_parser))
        except (UnicodeDecodeError, IOError):
            if _is_url(self.io):
                # the input is a url, so work out why lxml rejected it
                scheme = urlparse.urlparse(self.io).scheme
                if scheme in _valid_schemes:
                    # supported scheme: something else went wrong (maybe a
                    # faulty connection), so let the original error through
                    raise
                # lxml can't handle this scheme
                msg = ('{0} is not a valid url scheme, valid schemes are '
                       '{1}').format(scheme, _valid_schemes)
                raise ValueError(msg)

            # not a url: treat the input as a blob of markup
            r = _root_or_self(fromstring(self.io, parser=strict_parser))
        else:
            # only reached when parse() itself succeeded
            if not hasattr(r, 'text_content'):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
        return r
Beispiel #6
0
    def _build_doc(self):
        """Parse ``self.io`` with a non-recovering lxml HTML parser.

        ``lxml.html.parse`` is tried first. If it raises
        ``UnicodeDecodeError`` or ``IOError`` and ``self.io`` is not a URL
        (per ``_is_url``), the input is parsed with ``lxml.html.fromstring``
        instead. Whenever the parse result has a ``getroot`` method, the
        value it returns is used in place of the result itself.

        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        XMLSyntaxError
            * If ``parse`` succeeds but the result has no ``text_content``
              attribute.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.html import parse, fromstring, HTMLParser
        from lxml.etree import XMLSyntaxError

        def _root_or_self(parsed):
            # prefer the root when the result exposes one, otherwise keep
            # the result untouched
            try:
                return parsed.getroot()
            except AttributeError:
                return parsed

        strict_parser = HTMLParser(recover=False)

        try:
            # first attempt: hand the input straight to lxml
            r = _root_or_self(parse(self.io, parser=strict_parser))
        except (UnicodeDecodeError, IOError):
            if _is_url(self.io):
                # the input is a url, so work out why lxml rejected it
                scheme = urlparse.urlparse(self.io).scheme
                if scheme in _valid_schemes:
                    # supported scheme: something else went wrong (maybe a
                    # faulty connection), so let the original error through
                    raise
                # lxml can't handle this scheme
                msg = ('{0} is not a valid url scheme, valid schemes are '
                       '{1}').format(scheme, _valid_schemes)
                raise ValueError(msg)

            # not a url: treat the input as a blob of markup
            r = _root_or_self(fromstring(self.io, parser=strict_parser))
        else:
            # only reached when parse() itself succeeded
            if not hasattr(r, 'text_content'):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
        return r
Beispiel #7
0
    def _build_doc(self):
        """Read ``self.io`` and parse its ``table`` elements with bs4.

        ``self.io`` may be a URL (as judged by ``_is_url``), an object with
        a ``read`` method, the path of an existing file, or the raw markup
        itself.

        Returns
        -------
        BeautifulSoup
            The text parsed with a ``SoupStrainer('table')`` as
            ``parse_only``.

        Raises
        ------
        ValueError
            * If ``self.io`` is a URL and opening it raises
              ``urllib2.URLError``.
            * If ``self.io`` is not a URL, not file-like and not a string.
        AssertionError
            * If the text that was read is empty.
        """
        if _is_url(self.io):
            try:
                with contextlib.closing(urllib2.urlopen(self.io)) as url:
                    raw_text = url.read()
            except urllib2.URLError:
                raise ValueError('Invalid URL: "{0}"'.format(self.io))
        elif hasattr(self.io, 'read'):
            raw_text = self.io.read()
        elif isinstance(self.io, basestring):
            # Only strings are handed to os.path.isfile: passing it
            # arbitrary objects raises TypeError and would hide the
            # ValueError below.
            try:
                is_file = os.path.isfile(self.io)
            except (TypeError, ValueError):
                # a string that cannot be a path at all (e.g. it contains a
                # NUL byte) is treated as raw markup
                is_file = False

            if is_file:
                with open(self.io) as f:
                    raw_text = f.read()
            else:
                raw_text = self.io
        else:
            raise ValueError("Cannot read object of"
                             " type '{0}'".format(type(self.io)))

        # Raised explicitly rather than via ``assert`` so the check still
        # runs under ``python -O``; the exception type is unchanged.
        if not raw_text:
            raise AssertionError('No text parsed from document')

        from bs4 import BeautifulSoup, SoupStrainer
        strainer = SoupStrainer('table')
        return BeautifulSoup(raw_text, parse_only=strainer)
Beispiel #8
0
    def _build_doc(self):
        """Read ``self.io`` and parse its ``table`` elements with bs4.

        ``self.io`` may be a URL (as judged by ``_is_url``), an object with
        a ``read`` method, the path of an existing file, or the raw markup
        itself.

        Returns
        -------
        BeautifulSoup
            The text parsed with a ``SoupStrainer('table')`` as
            ``parse_only``.

        Raises
        ------
        ValueError
            * If ``self.io`` is a URL and opening it raises
              ``urllib2.URLError``.
            * If ``self.io`` is not a URL, not file-like and not a string.
        AssertionError
            * If the text that was read is empty.
        """
        if _is_url(self.io):
            try:
                with contextlib.closing(urllib2.urlopen(self.io)) as url:
                    raw_text = url.read()
            except urllib2.URLError:
                raise ValueError('Invalid URL: "{0}"'.format(self.io))
        elif hasattr(self.io, 'read'):
            raw_text = self.io.read()
        elif isinstance(self.io, basestring):
            # Only strings are handed to os.path.isfile: passing it
            # arbitrary objects raises TypeError and would hide the
            # ValueError below.
            try:
                is_file = os.path.isfile(self.io)
            except (TypeError, ValueError):
                # a string that cannot be a path at all (e.g. it contains a
                # NUL byte) is treated as raw markup
                is_file = False

            if is_file:
                with open(self.io) as f:
                    raw_text = f.read()
            else:
                raw_text = self.io
        else:
            raise ValueError("Cannot read object of"
                             " type '{0}'".format(type(self.io)))

        # Raised explicitly rather than via ``assert`` so the check still
        # runs under ``python -O``; the exception type is unchanged.
        if not raw_text:
            raise AssertionError('No text parsed from document')

        from bs4 import BeautifulSoup, SoupStrainer
        strainer = SoupStrainer('table')
        return BeautifulSoup(raw_text, parse_only=strainer)
Beispiel #9
0
    def _build_doc(self):
        """Parse ``self.io`` with lxml and return it run through ``clean_html``.

        ``lxml.html.parse`` is tried first. If it raises
        ``UnicodeDecodeError`` or ``IOError`` and ``self.io`` is not a URL
        (per ``_is_url``), the input is parsed with ``lxml.html.fromstring``
        instead.

        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.html import parse, fromstring
        from lxml.html.clean import clean_html

        try:
            # try to parse the input in the simplest way
            r = parse(self.io)
        except (UnicodeDecodeError, IOError):
            # if the input is a blob of html goop
            if not _is_url(self.io):
                r = fromstring(self.io)
            else:
                # the input is a url, so work out why parsing it failed
                scheme = urlparse.urlparse(self.io).scheme
                if scheme not in _valid_schemes:
                    # lxml can't parse it
                    msg = ('{0} is not a valid url scheme, valid schemes are '
                           '{1}').format(scheme, _valid_schemes)
                    raise ValueError(msg)
                else:
                    # something else happened: maybe a faulty connection.
                    # A bare ``raise`` re-raises the caught exception with
                    # its original traceback intact.
                    raise
        return clean_html(r)
Beispiel #10
0
    def _build_doc(self):
        """Parse ``self.io`` with lxml and return it run through ``clean_html``.

        ``lxml.html.parse`` is tried first. If it raises
        ``UnicodeDecodeError`` or ``IOError`` and ``self.io`` is not a URL
        (per ``_is_url``), the input is parsed with ``lxml.html.fromstring``
        instead.

        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.html import parse, fromstring
        from lxml.html.clean import clean_html

        try:
            # try to parse the input in the simplest way
            r = parse(self.io)
        except (UnicodeDecodeError, IOError):
            # if the input is a blob of html goop
            if not _is_url(self.io):
                r = fromstring(self.io)
            else:
                # the input is a url, so work out why parsing it failed
                scheme = urlparse.urlparse(self.io).scheme
                if scheme not in _valid_schemes:
                    # lxml can't parse it
                    msg = ('{0} is not a valid url scheme, valid schemes are '
                           '{1}').format(scheme, _valid_schemes)
                    raise ValueError(msg)
                else:
                    # something else happened: maybe a faulty connection.
                    # A bare ``raise`` re-raises the caught exception with
                    # its original traceback intact.
                    raise
        return clean_html(r)
Beispiel #11
0
    def __init__(self, path_or_buf, encoding=None):
        """Set up the reader state, open the input and read the header.

        Parameters
        ----------
        path_or_buf : str or file-like
            A URL (a ``str`` accepted by ``_is_url``), a filesystem path
            (any other ``str``, opened in binary mode), or an already-open
            buffer that is used as-is.
        encoding : optional
            Passed to the parent class initializer. When a URL is read on
            Python 3 and no encoding is set, ``'cp1252'`` is used and
            decoding errors are replaced rather than raised.
        """
        super(StataReader, self).__init__(encoding)
        self.col_sizes = ()
        self._has_string_data = False
        self._missing_values = False
        self._data_read = False
        self._value_labels_read = False
        if isinstance(path_or_buf, str) and _is_url(path_or_buf):
            from urllib.request import urlopen
            response = urlopen(path_or_buf)
            if py3compat.PY3:  # pragma: no cover
                if self._encoding:
                    errors = 'strict'
                else:
                    errors = 'replace'
                    self._encoding = 'cp1252'
                # The whole payload is read up front, so the connection can
                # be released straight away.
                try:
                    raw = response.read()
                finally:
                    response.close()
                self.path_or_buf = StringIO(self._decode_bytes(raw, errors))
            else:
                # Without this branch ``self.path_or_buf`` was left unset
                # and ``_read_header`` failed on the missing attribute.
                self.path_or_buf = response
        elif isinstance(path_or_buf, str):
            self.path_or_buf = open(path_or_buf, 'rb')
        else:
            self.path_or_buf = path_or_buf

        self._read_header()
Beispiel #12
0
    def __init__(self, path_or_buf, encoding=None):
        """Set up the reader state, open the input and read the header.

        Parameters
        ----------
        path_or_buf : str or file-like
            A URL (a ``str`` accepted by ``_is_url``), a filesystem path
            (any other ``str``, opened in binary mode), or an already-open
            buffer that is used as-is.
        encoding : optional
            Passed to the parent class initializer. When a URL is read on
            Python 3 and no encoding is set, ``'cp1252'`` is used and
            decoding errors are replaced rather than raised.
        """
        super(StataReader, self).__init__(encoding)
        self.col_sizes = ()
        self._has_string_data = False
        self._missing_values = False
        self._data_read = False
        self._value_labels_read = False
        if isinstance(path_or_buf, str) and _is_url(path_or_buf):
            from urllib.request import urlopen
            response = urlopen(path_or_buf)
            if py3compat.PY3:  # pragma: no cover
                if self._encoding:
                    errors = 'strict'
                else:
                    errors = 'replace'
                    self._encoding = 'cp1252'
                # The whole payload is read up front, so the connection can
                # be released straight away.
                try:
                    raw = response.read()
                finally:
                    response.close()
                self.path_or_buf = StringIO(self._decode_bytes(raw, errors))
            else:
                # Without this branch ``self.path_or_buf`` was left unset
                # and ``_read_header`` failed on the missing attribute.
                self.path_or_buf = response
        elif isinstance(path_or_buf, str):
            self.path_or_buf = open(path_or_buf, 'rb')
        else:
            self.path_or_buf = path_or_buf

        self._read_header()