Ejemplo n.º 1
0
 def write(self, data):
     """Append ``data`` to the pending buffer and flush every full block.

     ``data`` is first passed through ``_as_bytes`` (presumably so that
     text input is normalised to bytes - confirm against that helper).
     The result is added to ``self._buffer``; while the buffer holds at
     least 65536 (2**16) bytes, its leading 65536 bytes are handed to
     ``self._write_block`` and removed from the buffer.  Whatever is left
     over (fewer than 65536 bytes) stays buffered for a later call.

     Returns ``None``.
     """
     block_size = 65536  # 2**16
     chunk = _as_bytes(data)
     self._buffer += chunk
     # When the combined length is still below one block this loop simply
     # does not run, which is the "just cache it" case.
     while len(self._buffer) >= block_size:
         self._write_block(self._buffer[:block_size])
         self._buffer = self._buffer[block_size:]
Ejemplo n.º 2
0
 def read(self, handle):
     """Parse the XML data in ``handle`` and return the parsed object.

     The expat parser needs binary input, so text wrappers around the
     handle are removed before parsing:

     - an ``EvilHandleHack`` instance is replaced by its ``_handle``;
     - a ``TextIOWrapper`` is replaced by its underlying ``buffer``;
     - on Python 3, an ``io.StringIO`` is read fully and re-wrapped in a
       ``BytesIO``.

     Returns ``self.object`` as it stands once ``self.parser`` has
     consumed the handle.

     Raises:
         IOError: the handle reports itself as closed.
         CorruptedXMLError: expat failed while a start-element handler
             was installed on the parser.
         NotXMLError: expat failed, or no result object was produced,
             while no start-element handler was installed.
         RuntimeError: parsing finished without error but no result
             object was produced although a start-element handler was
             installed.
     """
     # HACK: undo text-mode wrapping; the class is matched by name so the
     # wrapper types do not have to be imported here.
     if handle.__class__.__name__ == "EvilHandleHack":
         handle = handle._handle
     if handle.__class__.__name__ == "TextIOWrapper":
         handle = handle.buffer

     # Parsing a closed handle may crash the interpreter, see
     # http://bugs.python.org/issue4877
     if hasattr(handle, "closed") and handle.closed:
         raise IOError("Can't parse a closed handle")

     if sys.version_info[0] >= 3:
         # A unicode StringIO has no binary buffer to fall back on, so
         # its whole content is converted to bytes up front.
         import io
         if isinstance(handle, io.StringIO):
             from py3k import _as_bytes
             handle = io.BytesIO(_as_bytes(handle.read()))

     # A set StartElementHandler is treated as proof that the XML
     # declaration was seen (presumably it is installed by the
     # declaration callback - confirm against the parser setup).
     try:
         self.parser.ParseFile(handle)
     except expat.ExpatError as e:
         if not self.parser.StartElementHandler:
             # No declaration seen: the input is probably not XML at all.
             raise NotXMLError(e)
         # Declaration seen: this is XML, so most likely it is corrupted.
         raise CorruptedXMLError(e)

     try:
         result = self.object
     except AttributeError:
         if not self.parser.StartElementHandler:
             # No declaration seen: the input is probably not XML at all.
             raise NotXMLError("XML declaration not found")
         # Declaration seen and expat reported nothing wrong, so a result
         # should exist; its absence points at a bug in this module.
         raise RuntimeError(
             "Failed to parse the XML file correctly, possibly due to a bug in "
             "Bio.Entrez. Please contact the Biopython developers at "
             "[email protected] for assistance."
         )
     return result
Ejemplo n.º 3
0
 def read(self, handle):
     """Run the expat parser over ``handle`` and return the result.

     Because the parser expects binary data, the handle is first reduced
     to a binary one: an ``EvilHandleHack`` is swapped for its
     ``_handle``, a ``TextIOWrapper`` for its ``buffer``, and (Python 3
     only) an ``io.StringIO`` is read in full and replaced by a
     ``BytesIO`` holding the converted content.

     Returns ``self.object`` after ``self.parser.ParseFile`` has run.

     Raises:
         IOError: the handle has a true ``closed`` attribute.
         CorruptedXMLError: expat raised an error and the parser has a
             start-element handler installed.
         NotXMLError: expat raised an error, or ``self.object`` is
             missing, and the parser has no start-element handler.
         RuntimeError: ``self.object`` is missing although parsing
             succeeded and a start-element handler is installed.
     """
     # HACK: strip text-mode wrappers, identified by class name only.
     if handle.__class__.__name__ == "EvilHandleHack":
         handle = handle._handle
     if handle.__class__.__name__ == "TextIOWrapper":
         handle = handle.buffer

     if hasattr(handle, "closed") and handle.closed:
         # Guards against a possible segmentation fault, see
         # http://bugs.python.org/issue4877
         raise IOError("Can't parse a closed handle")

     if sys.version_info[0] >= 3:
         # Another hack: a unicode StringIO must be turned into bytes.
         from io import BytesIO, StringIO
         if isinstance(handle, StringIO):
             from py3k import _as_bytes
             text = handle.read()
             handle = BytesIO(_as_bytes(text))

     # Throughout, a set StartElementHandler is taken to mean that the
     # XML declaration was encountered (presumably installed by the
     # declaration callback - confirm against the parser setup).
     try:
         self.parser.ParseFile(handle)
     except expat.ExpatError as e:
         if self.parser.StartElementHandler:
             # Known to be XML, so the data is most likely corrupted.
             raise CorruptedXMLError(e)
         # Never saw the declaration: probably not XML at all.
         raise NotXMLError(e)

     try:
         parsed = self.object
     except AttributeError:
         if self.parser.StartElementHandler:
             # expat was happy yet no object exists: that is a bug here.
             raise RuntimeError(
                 "Failed to parse the XML file correctly, possibly due to a "
                 "bug in Bio.Entrez. Please contact the Biopython developers "
                 "at [email protected] for assistance."
             )
         # Never saw the declaration: probably not XML at all.
         raise NotXMLError("XML declaration not found")
     else:
         return parsed
Ejemplo n.º 4
0
def _open(cgi, params=None, post=None, ecitmatch=False):
    """Build the request URL and open a handle to it (PRIVATE).

    ``cgi`` is the URL of the Entrez CGI script to access and ``params``
    holds the options to pass to it; ``params`` is prepared by
    ``_construct_params`` and encoded by ``_encode_options`` (which also
    receives ``ecitmatch``).

    ``post`` explicitly selects an HTTP POST (true) or an HTTP GET
    (false).  When left as ``None``, POST is chosen if the encoded options
    are more than 1000 characters long and GET is used otherwise.

    Successive calls are spaced out to respect the NCBI usage limits: at
    least a third of a second apart (at most three queries per second),
    or 0.1 seconds apart when the module-level ``api_key`` is set.  The
    time of the latest call is kept in ``_open.previous``, which is
    expected to be initialised elsewhere in the module.

    HTTP errors raised while opening the URL are passed on to the caller.
    Returns the opened handle after it has been passed through
    ``_binary_to_string_handle``.
    """
    params = _construct_params(params)
    options = _encode_options(ecitmatch, params)

    # NCBI requirement: at most three queries per second if no API key is
    # provided, i.e. at least a third of a second between queries.
    if api_key:
        delay = 0.1
    else:
        delay = 0.333333334
    now = time.time()
    pause = _open.previous + delay - now
    if pause > 0:
        time.sleep(pause)
        now += pause
    _open.previous = now

    # An explicit boolean for post overrides the length-based choice.
    if post is None and len(options) > 1000:
        post = True
    cgi = _construct_cgi(cgi, post, options)

    try:
        if not post:
            handle = _urlopen(cgi)
        else:
            handle = _urlopen(cgi, data=_as_bytes(options))
    except _HTTPError as err:
        raise err

    return _binary_to_string_handle(handle)
Ejemplo n.º 5
0
def _open(cgi, params=None, post=None, ecitmatch=False):
    """Build the Entrez request and open a handle to it (PRIVATE).

    ``cgi`` is the URL of the CGI script to access and ``params`` is a
    dictionary of options to send with the request (``None`` means no
    options).  Entries whose value is ``None`` are removed, and the
    module-level ``tool`` and ``email`` settings are filled in when the
    caller did not supply them; a ``UserWarning`` is issued if no email
    address is available.  Note that a dictionary passed in as ``params``
    is modified in place.

    ``post`` explicitly selects an HTTP POST (true) or an HTTP GET
    (false).  When left as ``None``, POST is used if the URL-encoded
    options are more than 1000 characters long and GET is used otherwise.

    With ``ecitmatch`` set, pipe characters are sent literally instead of
    percent-encoded as ``%7C``.

    Calls are spaced at least a third of a second apart, i.e. at most
    three queries per second, to avoid abusing the NCBI servers.  The time
    of the latest call is kept in ``_open.previous``, which is expected to
    be initialised elsewhere in the module.

    HTTP errors raised while opening the URL are passed on to the caller.
    Returns the opened handle after it has been passed through
    ``_binary_to_string_handle``.
    """
    if params is None:
        params = {}

    # NCBI requirement: at most three queries per second, i.e. at least a
    # third of a second between queries.
    delay = 0.333333334
    now = time.time()
    pause = _open.previous + delay - now
    if pause > 0:
        time.sleep(pause)
        now += pause
    _open.previous = now

    # Drop every parameter whose value is None.
    unset = [name for name, value in params.items() if value is None]
    for name in unset:
        del params[name]

    # Identify the client software unless the caller already did so.
    if "tool" not in params:
        params["tool"] = tool

    # Identify the user, or warn when no address is configured.
    if "email" not in params:
        if email is None:
            warnings.warn("""
Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request.  As an example, if your email address
is [email protected], you can specify it as follows:
   from Bio import Entrez
   Entrez.email = '*****@*****.**'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)
        else:
            params["email"] = email

    options = _urlencode(params, doseq=True)
    if ecitmatch:
        # _urlencode escapes pipes, but ECitMatch expects them verbatim.
        options = options.replace("%7C", "|")

    # An explicit boolean for post overrides the length-based choice.
    if post is None and len(options) > 1000:
        post = True

    try:
        if post:
            # HTTP POST: options travel in the request body.
            handle = _urlopen(cgi, data=_as_bytes(options))
        else:
            # HTTP GET: options travel in the query string.
            handle = _urlopen(cgi + "?" + options)
    except _HTTPError as err:
        raise err

    return _binary_to_string_handle(handle)
Ejemplo n.º 6
0
def _open(cgi, params=None, post=None, ecitmatch=False):
    """Open a handle to an Entrez CGI script (PRIVATE).

    Arguments:
     - cgi - URL of the CGI script to access.
     - params - dictionary of options to pass to the script, or ``None``
       for none.  Entries whose value is ``None`` are deleted, and the
       module-level ``tool`` and ``email`` values are added when the keys
       are absent, so a dictionary passed in is modified in place.  If no
       email address is available a ``UserWarning`` is issued instead.
     - post - true forces an HTTP POST and false forces an HTTP GET.  With
       the default of ``None``, POST is used when the URL-encoded options
       are more than 1000 characters long and GET otherwise.
     - ecitmatch - when true, ``%7C`` in the encoded options is turned
       back into a literal pipe character.

    To stay within the "up to three queries per second" rule, the call
    sleeps until at least a third of a second has passed since the time
    stored in ``_open.previous`` (expected to be initialised elsewhere in
    the module) and then records the new time there.

    HTTP errors raised while opening the URL are passed on to the caller.
    Returns the opened handle after it has been passed through
    ``_binary_to_string_handle``.
    """
    missing_email_notice = """
Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request.  As an example, if your email address
is [email protected], you can specify it as follows:
   from Bio import Entrez
   Entrez.email = '*****@*****.**'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities."""

    if params is None:
        params = {}

    # Throttle: at least a third of a second between queries.
    delay = 0.333333334
    started = time.time()
    remaining = _open.previous + delay - started
    if remaining > 0:
        time.sleep(remaining)
        started += remaining
    _open.previous = started

    # Remove None values from the parameters.
    for key in [k for k, v in params.items() if v is None]:
        del params[key]

    # Tell Entrez which tool is calling, unless the caller already has.
    if "tool" not in params:
        params["tool"] = tool

    # Tell Entrez who is calling, or warn that nobody said.
    if "email" not in params:
        if email is None:
            warnings.warn(missing_email_notice, UserWarning)
        else:
            params["email"] = email

    options = _urlencode(params, doseq=True)
    if ecitmatch:
        # _urlencode escapes pipes, but ECitMatch expects them verbatim.
        options = options.replace("%7C", "|")

    # post stays as given unless it was None and the options are long.
    if post is None and len(options) > 1000:
        post = True

    try:
        if not post:
            # HTTP GET
            handle = _urlopen(cgi + "?" + options)
        else:
            # HTTP POST
            handle = _urlopen(cgi, data=_as_bytes(options))
    except _HTTPError as error:
        raise error

    return _binary_to_string_handle(handle)