Example #1
def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer
    passthru otherwise.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath, or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding
    """

    if _is_url(filepath_or_buffer):
        req = _urlopen(str(filepath_or_buffer))
        return maybe_read_encoded_stream(req, encoding)

    if _is_s3_url(filepath_or_buffer):
        try:
            import boto
        except:
            raise ImportError("boto is required to handle s3 files")
        # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
        # are environment variables
        parsed_url = parse_url(filepath_or_buffer)
        conn = boto.connect_s3()
        b = conn.get_bucket(parsed_url.netloc)
        k = boto.s3.key.Key(b)
        k.key = parsed_url.path
        filepath_or_buffer = StringIO(k.get_contents_as_string())
        return filepath_or_buffer, None

    return filepath_or_buffer, None
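The early pandas variants above return a (filepath_or_buffer, encoding) pair and only special-case URLs. As a point of comparison, here is a minimal standalone sketch of the same URL-vs-passthrough pattern using only the standard library; open_path_or_url is an illustrative name, not a pandas helper.

from io import BytesIO
from urllib.parse import urlparse
from urllib.request import urlopen


def open_path_or_url(path_or_url):
    """Return a readable binary buffer for a URL; pass anything else through."""
    if urlparse(str(path_or_url)).scheme in ('http', 'https', 'ftp'):
        with urlopen(str(path_or_url)) as resp:   # fetch the remote content
            return BytesIO(resp.read())           # buffer it in memory
    return path_or_url                            # local path or buffer: passthrough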
Example #2
def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer
    passthru otherwise.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath, or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding
    """

    if _is_url(filepath_or_buffer):
        req = _urlopen(str(filepath_or_buffer))
        return maybe_read_encoded_stream(req,encoding)

    if _is_s3_url(filepath_or_buffer):
        try:
            import boto
        except:
            raise ImportError("boto is required to handle s3 files")
        # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
        # are environment variables
        parsed_url = parse_url(filepath_or_buffer)
        conn = boto.connect_s3()
        b = conn.get_bucket(parsed_url.netloc)
        k = boto.s3.key.Key(b)
        k.key = parsed_url.path
        filepath_or_buffer = StringIO(k.get_contents_as_string())
        return filepath_or_buffer, None

    return filepath_or_buffer, None
Example #3
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer
    passthru otherwise.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding, the compression
    """

    if _is_url(filepath_or_buffer):
        req = _urlopen(str(filepath_or_buffer))
        if compression == 'infer':
            content_encoding = req.headers.get('Content-Encoding', None)
            if content_encoding == 'gzip':
                compression = 'gzip'
            else:
                compression = None
        # cat on the compression to the tuple returned by the function
        to_return = list(maybe_read_encoded_stream(req, encoding, compression)) + \
                    [compression]
        return tuple(to_return)

    if _is_s3_url(filepath_or_buffer):
        try:
            import boto
        except:
            raise ImportError("boto is required to handle s3 files")
        # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
        # are environment variables
        parsed_url = parse_url(filepath_or_buffer)

        try:
            conn = boto.connect_s3()
        except boto.exception.NoAuthHandlerFound:
            conn = boto.connect_s3(anon=True)

        b = conn.get_bucket(parsed_url.netloc, validate=False)
        if compat.PY2 and (compression == 'gzip' or
                           (compression == 'infer' and
                            filepath_or_buffer.endswith(".gz"))):
            k = boto.s3.key.Key(b, parsed_url.path)
            filepath_or_buffer = BytesIO(k.get_contents_as_string(
                encoding=encoding))
        else:
            k = BotoFileLikeReader(b, parsed_url.path, encoding=encoding)
            k.open('r')  # Expose read errors immediately
            filepath_or_buffer = k
        return filepath_or_buffer, None, compression

    # It is a pathlib.Path/py.path.local or string
    filepath_or_buffer = _stringify_path(filepath_or_buffer)
    return _expand_user(filepath_or_buffer), None, compression
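The 'infer' branch above maps the HTTP Content-Encoding header onto the compression argument. A small hedged sketch of just that decision, assuming req is a urllib response object (the function name is illustrative, not a pandas helper):

def infer_compression_from_response(req, compression):
    # 'infer' means: trust the server's Content-Encoding header, else no compression.
    if compression == 'infer':
        return 'gzip' if req.headers.get('Content-Encoding') == 'gzip' else None
    return compression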
Example #4
def urlopen(url):
    sys.stdout.flush()
    url = url.replace('&max-results=20', '&max-results=100')
    if '&key' not in url:
        url += key
    print url
    return _urlopen(url, timeout=60).read()
Example #5
    def get_recent_changes(self):
        """Returns three lists of the newest weekly files (added,mod,obsolete).

        Reads the directories with changed entries from the PDB server and
        returns a tuple of three URL's to the files of new, modified and
        obsolete entries from the most recent list. The directory with the
        largest numerical name is used.
        Returns None if something goes wrong.

        Contents of the data/status dir (20031013 would be used);
        drwxrwxr-x   2 1002     sysadmin     512 Oct  6 18:28 20031006
        drwxrwxr-x   2 1002     sysadmin     512 Oct 14 02:14 20031013
        -rw-r--r--   1 1002     sysadmin    1327 Mar 12  2001 README
        """
        url = self.pdb_server + '/pub/pdb/data/status/'
        with contextlib.closing(_urlopen(url)) as handle:
            recent = filter(str.isdigit,
                            (x.split()[-1] for x in handle.readlines())
                            )[-1]

        path = self.pdb_server + '/pub/pdb/data/status/%s/' % (recent)

        # Retrieve the lists
        added = self.get_status_list(path + 'added.pdb')
        modified = self.get_status_list(path + 'modified.pdb')
        obsolete = self.get_status_list(path + 'obsolete.pdb')
        return [added, modified, obsolete]
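Note that the directory parse above relies on Python 2's filter() returning a list that can be indexed with [-1]; under Python 3 the listing would have to be materialized first. A self-contained Python 3 sketch of the same step, with the status URL layout taken from the docstring:

import contextlib
from urllib.request import urlopen

def newest_status_dir(listing_url):
    """Return the largest all-digit directory name in a server listing."""
    with contextlib.closing(urlopen(listing_url)) as handle:
        names = [line.split()[-1].decode() for line in handle if line.strip()]
    return max(name for name in names if name.isdigit())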
Example #6
    def get_all_obsolete(self):
        """Returns a list of all obsolete entries ever in the PDB.

        Returns a list of all obsolete pdb codes that have ever been
        in the PDB.

        Gets and parses the file from the PDB server in the format
        (the first pdb_code column is the one used). The file looks
        like this:

         LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
        OBSLTE    31-JUL-94 116L     216L
        ...
        OBSLTE    29-JAN-96 1HFT     2HFT
        OBSLTE    21-SEP-06 1HFV     2J5X
        OBSLTE    21-NOV-03 1HG6
        OBSLTE    18-JUL-84 1HHB     2HHB 3HHB
        OBSLTE    08-NOV-96 1HID     2HID
        OBSLTE    01-APR-97 1HIU     2HIU
        OBSLTE    14-JAN-04 1HKE     1UUZ
        ...

        """
        url = self.pdb_server + '/pub/pdb/data/status/obsolete.dat'
        with contextlib.closing(_urlopen(url)) as handle:
            # Extract pdb codes. Could use a list comprehension, but I want
            # to include an assert to check for mis-reading the data.
            obsolete = []
            for line in handle:
                if not line.startswith("OBSLTE "):
                    continue
                pdb = line.split()[2]
                assert len(pdb) == 4
                obsolete.append(pdb)
        return obsolete
Example #7
    def get_all_obsolete(self):
        """Returns a list of all obsolete entries ever in the PDB.

        Returns a list of all obsolete pdb codes that have ever been
        in the PDB.

        Gets and parses the file from the PDB server in the format
        (the first pdb_code column is the one used). The file looks
        like this:

         LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
        OBSLTE    31-JUL-94 116L     216L
        ...
        OBSLTE    29-JAN-96 1HFT     2HFT
        OBSLTE    21-SEP-06 1HFV     2J5X
        OBSLTE    21-NOV-03 1HG6
        OBSLTE    18-JUL-84 1HHB     2HHB 3HHB
        OBSLTE    08-NOV-96 1HID     2HID
        OBSLTE    01-APR-97 1HIU     2HIU
        OBSLTE    14-JAN-04 1HKE     1UUZ
        ...

        """
        url = self.pdb_server + '/pub/pdb/data/status/obsolete.dat'
        with contextlib.closing(_urlopen(url)) as handle:
            # Extract pdb codes. Could use a list comprehension, but I want
            # to include an assert to check for mis-reading the data.
            obsolete = []
            for line in handle:
                if not line.startswith("OBSLTE "):
                    continue
                pdb = line.split()[2]
                assert len(pdb) == 4
                obsolete.append(pdb)
        return obsolete
Example #8
    def _update_usa(self):
        """
        Update whitelist based on usa.gov
        """
        print 'Getting agencies from usa.gov...'

        url_base = 'https://www.usa.gov'
        letters = _string.ascii_lowercase

        agency_dic = {}

        for letter in letters:
            url = url_base + '/federal-agencies/' + letter
            soup = _BeautifulSoup(_urlopen(url).read())

            links_content = [l for l in soup.find_all('ul') if 'class' in l.attrs and 'one_column_bullet' in l['class']]
            if len(links_content) == 1:
                links_list = links_content[0].find_all('a')
                for agency_html in links_list:
                    name_short = self._preprocess_name(agency_html.string)
                    agency_dic[name_short] = {'html': agency_html,
                                              'url': url_base + agency_html['href'],
                                              'name_full': agency_html.string,
                                              'source': 'usa.gov'}

                    print agency_html.string

            elif len(links_content) == 0:
                pass

            else:
                raise ValueError('Too many list elements found! Please modify the HTML parser.')

        self.agency_dictionary.update(agency_dic)
Example #9
    def get_recent_changes(self):
        """Returns three lists of the newest weekly files (added,mod,obsolete).

        Reads the directories with changed entries from the PDB server and
        returns a tuple of three URL's to the files of new, modified and
        obsolete entries from the most recent list. The directory with the
        largest numerical name is used.
        Returns None if something goes wrong.

        Contents of the data/status dir (20031013 would be used);
        drwxrwxr-x   2 1002     sysadmin     512 Oct  6 18:28 20031006
        drwxrwxr-x   2 1002     sysadmin     512 Oct 14 02:14 20031013
        -rw-r--r--   1 1002     sysadmin    1327 Mar 12  2001 README
        """
        url = self.pdb_server + '/pub/pdb/data/status/'
        with contextlib.closing(_urlopen(url)) as handle:
            recent = filter(str.isdigit,
                            (x.split()[-1] for x in handle.readlines()))[-1]

        path = self.pdb_server + '/pub/pdb/data/status/%s/' % (recent)

        # Retrieve the lists
        added = self.get_status_list(path + 'added.pdb')
        modified = self.get_status_list(path + 'modified.pdb')
        obsolete = self.get_status_list(path + 'obsolete.pdb')
        return [added, modified, obsolete]
Example #10
def urlopen(url):
    sys.stdout.flush()
    url = url.replace('&max-results=20', '&max-results=100')
    if '&key' not in url:
        url += key
    print url
    return _urlopen(url, timeout=60).read()
Example #11
def urlopen(url, data=None, lang='en'):
    request = Request(url, data, {
        "Accept-Language": "%s,en-us;q=0.7,en;q=0.3"%lang.lower(),
        "User-Agent": UA,
    })
    logging.debug("urlopen: %s", url)
    time.sleep(URLOPEN_DELAY)
    return _urlopen(request)
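This wrapper attaches Accept-Language and User-Agent headers and throttles every call by URLOPEN_DELAY seconds. A hypothetical call follows; UA, URLOPEN_DELAY and the target URL are assumptions, not part of the original module:

# Hypothetical usage of the wrapper above; the caller reads the body
# and is responsible for closing the response.
resp = urlopen('https://example.com/resource', lang='de')
body = resp.read()
resp.close()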
Example #12
 def get_all_entries(self):
     """Retrieves a big file containing all the 
     PDB entries and some annotation to them. 
     Returns a list of PDB codes in the index file.
     """
     print "retrieving index file. Takes about 5 MB."
     url = _urlopen(self.pdb_server + "/pub/pdb/derived_data/index/entries.idx")
     return [line[:4] for line in url.readlines()[2:] if len(line) > 4]
Example #13
def fetch_film_info_from_criticker(film_data):
    url = 'http://www.criticker.com/?f=' + film_data['criticker_id']
    title_page = None
    try:
        page = unicode(_urlopen(url, None, 5).read(), 'iso-8859-1')
        soup = BeautifulSoup(page)
        title_page = soup.find("div", attrs={"id":"fi_info_filmname"})
    except URLError, e:
        logger.error("URL Error: " + str(e.reason) + ": " + url)
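Example #13 is Python 2 code (unicode(), the "except URLError, e" syntax). A Python 3 sketch of the same fetch-and-parse, assuming bs4 is installed; the URL scheme and element id are taken from the example:

from urllib.error import URLError
from urllib.request import urlopen
from bs4 import BeautifulSoup

def fetch_film_title_div(criticker_id):
    url = 'http://www.criticker.com/?f=' + criticker_id
    try:
        page = urlopen(url, None, 5).read().decode('iso-8859-1')
    except URLError:
        return None   # the original logs e.reason here
    return BeautifulSoup(page, 'html.parser').find(
        'div', attrs={'id': 'fi_info_filmname'})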
Example #14
 def get_all_entries(self):
     """Retrieves a big file containing all the 
     PDB entries and some annotation to them. 
     Returns a list of PDB codes in the index file.
     """
     print "retrieving index file. Takes about 5 MB."
     url = _urlopen(self.pdb_server +
                    '/pub/pdb/derived_data/index/entries.idx')
     return [line[:4] for line in url.readlines()[2:] if len(line) > 4]
Example #15
def get_filepath_or_buffer(filepath_or_buffer,
                           encoding=None,
                           compression=None,
                           mode=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer.
    Otherwise passthrough.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'
    mode : str, optional

    Returns
    -------
    tuple of ({a filepath_ or buffer or S3File instance},
              encoding, str,
              compression, str,
              should_close, bool)
    """
    filepath_or_buffer = _stringify_path(filepath_or_buffer)

    if _is_url(filepath_or_buffer):
        req = _urlopen(filepath_or_buffer)
        content_encoding = req.headers.get('Content-Encoding', None)
        if content_encoding == 'gzip':
            # Override compression based on Content-Encoding header
            compression = 'gzip'
        reader = BytesIO(req.read())
        req.close()
        return reader, encoding, compression, True

    if is_s3_url(filepath_or_buffer):
        from pandas.io import s3
        return s3.get_filepath_or_buffer(filepath_or_buffer,
                                         encoding=encoding,
                                         compression=compression,
                                         mode=mode)

    if is_gcs_url(filepath_or_buffer):
        from pandas.io import gcs
        return gcs.get_filepath_or_buffer(filepath_or_buffer,
                                          encoding=encoding,
                                          compression=compression,
                                          mode=mode)

    if isinstance(filepath_or_buffer,
                  (compat.string_types, compat.binary_type, mmap.mmap)):
        return _expand_user(filepath_or_buffer), None, compression, False

    if not is_file_like(filepath_or_buffer):
        msg = "Invalid file path or buffer object type: {_type}"
        raise ValueError(msg.format(_type=type(filepath_or_buffer)))

    return filepath_or_buffer, None, compression, False
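The four-element return value above adds a should_close flag so the caller knows whether it owns the handle. A minimal caller-side sketch under that contract; the URL is illustrative and get_filepath_or_buffer is the function shown above, not a public pandas API:

buf, encoding, compression, should_close = get_filepath_or_buffer(
    'https://example.com/data.csv', compression='infer')
try:
    payload = buf.read()
finally:
    if should_close:       # only close handles this helper opened itself
        buf.close()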
Example #16
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer
    passthru otherwise.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath, or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding, the compression
    """

    if _is_url(filepath_or_buffer):
        req = _urlopen(str(filepath_or_buffer))
        if compression == 'infer':
            content_encoding = req.headers.get('Content-Encoding', None)
            if content_encoding == 'gzip':
                compression = 'gzip'
            else:
                compression = None
        # cat on the compression to the tuple returned by the function
        to_return = list(maybe_read_encoded_stream(req, encoding, compression)) + \
                    [compression]
        return tuple(to_return)

    if _is_s3_url(filepath_or_buffer):
        try:
            import boto
        except:
            raise ImportError("boto is required to handle s3 files")
        # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
        # are environment variables
        parsed_url = parse_url(filepath_or_buffer)

        try:
            conn = boto.connect_s3()
        except boto.exception.NoAuthHandlerFound:
            conn = boto.connect_s3(anon=True)

        b = conn.get_bucket(parsed_url.netloc, validate=False)
        if compat.PY2 and (compression == 'gzip' or
                           (compression == 'infer' and
                            filepath_or_buffer.endswith(".gz"))):
            k = boto.s3.key.Key(b, parsed_url.path)
            filepath_or_buffer = BytesIO(k.get_contents_as_string(
                encoding=encoding))
        else:
            k = BotoFileLikeReader(b, parsed_url.path, encoding=encoding)
            k.open('r')  # Expose read errors immediately
            filepath_or_buffer = k
        return filepath_or_buffer, None, compression

    return _expand_user(filepath_or_buffer), None, compression
Example #17
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None, mode=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer.
    Otherwise passthrough.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'
    mode : str, optional

    Returns
    -------
    tuple of ({a filepath_ or buffer or S3File instance},
              encoding, str,
              compression, str,
              should_close, bool)
    """
    filepath_or_buffer = _stringify_path(filepath_or_buffer)

    if _is_url(filepath_or_buffer):
        req = _urlopen(filepath_or_buffer)
        content_encoding = req.headers.get('Content-Encoding', None)
        if content_encoding == 'gzip':
            # Override compression based on Content-Encoding header
            compression = 'gzip'
        reader = BytesIO(req.read())
        req.close()
        return reader, encoding, compression, True

    if is_s3_url(filepath_or_buffer):
        from pandas.io import s3
        return s3.get_filepath_or_buffer(filepath_or_buffer,
                                         encoding=encoding,
                                         compression=compression,
                                         mode=mode)

    if is_gcs_url(filepath_or_buffer):
        from pandas.io import gcs
        return gcs.get_filepath_or_buffer(filepath_or_buffer,
                                          encoding=encoding,
                                          compression=compression,
                                          mode=mode)

    if isinstance(filepath_or_buffer, (compat.string_types,
                                       compat.binary_type,
                                       mmap.mmap)):
        return _expand_user(filepath_or_buffer), None, compression, False

    if not is_file_like(filepath_or_buffer):
        msg = "Invalid file path or buffer object type: {_type}"
        raise ValueError(msg.format(_type=type(filepath_or_buffer)))

    return filepath_or_buffer, None, compression, False
Example #18
def load(filename):
    if filename[:4] == 'http':
        resp = _urlopen(filename)
        dict = _json.loads(resp.read(), object_pairs_hook=_parse_json)
    else:
        filename = os.path.expanduser(filename)
        f = open(filename, 'r')
        dict = _json.load(f, object_pairs_hook=_parse_json)
        f.close()

    return dict
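The loader above never closes the URL response and shadows the dict builtin. A tightened sketch of the same logic with explicit resource handling; the object_pairs_hook (called _parse_json in the example) is passed in rather than assumed:

import json
import os
from contextlib import closing
from urllib.request import urlopen

def load(filename, object_pairs_hook=None):
    if filename.startswith('http'):
        with closing(urlopen(filename)) as resp:
            return json.loads(resp.read().decode('utf-8'),
                              object_pairs_hook=object_pairs_hook)
    with open(os.path.expanduser(filename)) as f:
        return json.load(f, object_pairs_hook=object_pairs_hook)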
Example #19
 def get_all_entries(self):
     """Retrieves a big file containing all the
     PDB entries and some annotation to them.
     Returns a list of PDB codes in the index file.
     """
     print("retrieving index file. Takes about 5 MB.")
     url = self.pdb_server + '/pub/pdb/derived_data/index/entries.idx'
     with contextlib.closing(_urlopen(url)) as handle:
         all_entries = [line[:4] for line in handle.readlines()[2:]
                        if len(line) > 4]
     return all_entries
Example #20
 def get_seqres_file(self, savefile="pdb_seqres.txt"):
     """Retrieves a (big) file containing all the sequences of PDB entries
     and writes it to a file.
     """
     print "retrieving sequence file. Takes about 15 MB."
     handle = _urlopen(self.pdb_server + "/pub/pdb/derived_data/pdb_seqres.txt")
     lines = handle.readlines()
     outfile = open(savefile, "w")
     outfile.writelines(lines)
     outfile.close()
     handle.close()
Example #21
    def _update_wikipedia(self):
        # do a little bit of name preprocessing here too
        from requests import ConnectionError
        from wikipedia import PageError

        print 'Getting data from Wikipedia...'

        page_current = _wikipedia.page('List_of_federal_agencies_in_the_United_States')
        html = page_current.html()
        subset = html[_re.search('<h2>.*?Legislative Branch', html).start():_re.search('<h2>.*?See also', html).start()]
        soup = _BeautifulSoup(subset)

        links = soup.find_all(lambda x: x.name == 'a' and x.has_attr('href') and '/wiki/' in x['href'] and
                              'File:' not in x['href'])

        agency_dic = {self._preprocess_name(link['title']): {'html': link,
                                                             'url': 'https://en.wikipedia.org' + link['href'],
                                                             'name_full': link['title'],
                                                             'source': 'wikipedia'}
                      for link in links}

        category_pages = ['https://en.wikipedia.org/w/api.php?action=query&list=categorymembers&' +
                          'cmtitle=Category:Defunct_agencies_of_the_United_States_government&cmlimit=500&format=json',
                          'https://en.wikipedia.org/w/api.php?action=query&list=categorymembers&' +
                          'cmtitle=Category:Committees_of_the_United_States_Senate&cmlimit=500&format=json',
                          'https://en.wikipedia.org/w/api.php?action=query&list=categorymembers&' +
                          'cmtitle=Category:Joint_committees_of_the_United_States_Congress' +
                          '&cmlimit=500&format=json',
                          'https://en.wikipedia.org/w/api.php?action=query&list=categorymembers&' +
                          'cmtitle=Category:Committees_of_the_United_States_House_of_Representatives' +
                          '&cmlimit=500&format=json',
                          ]

        for category_page in category_pages:
            content_defunct = _json.loads(_urlopen(category_page).read())

            for result in content_defunct['query']['categorymembers']:
                if result['ns'] == 0:
                    url_defunct = 'https://en.wikipedia.org/wiki/' + _re.sub(' ', '_', result['title'])
                    print(result['title'])
                    try:
                        page_defunct = _wikipedia.page(result['title'])

                        name_short = self._preprocess_name(result['title'])

                        agency_dic[name_short] = {'html': page_defunct.html(),
                                                  'url': url_defunct,
                                                  'name_full': result['title'],
                                                  'source': 'wikipedia'}

                    except (ConnectionError, PageError):
                        print('Failed to get agency HTML!')

        self.agency_dictionary.update(agency_dic)
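The category scan above queries the MediaWiki categorymembers API and keeps only namespace-0 pages. A standalone sketch of that request, with the endpoint and parameters taken from the URLs in the example and the result handling reduced to titles:

import json
from urllib.parse import urlencode
from urllib.request import urlopen

def category_members(category, limit=500):
    params = urlencode({'action': 'query', 'list': 'categorymembers',
                        'cmtitle': 'Category:' + category,
                        'cmlimit': limit, 'format': 'json'})
    url = 'https://en.wikipedia.org/w/api.php?' + params
    with urlopen(url) as resp:
        data = json.loads(resp.read().decode('utf-8'))
    return [m['title'] for m in data['query']['categorymembers']
            if m['ns'] == 0]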
Example #22
def urlopen(url, data=None, *args, **kwargs):
    if not isinstance(url, Request):
        url = Request(url, data)
        data = None
    if 'basic_auth' in kwargs:
        if kwargs['basic_auth']:
            a = base64.b64encode(':'.join(kwargs['basic_auth']))
            url.add_header('Authorization', 'Basic '+a)
        del(kwargs['basic_auth'])
    if 'authorization' in kwargs:
        if kwargs['authorization']:
            url.add_header('Authorization', kwargs['authorization'])
        del(kwargs['authorization'])
    if sys.version_info[0] == 2:
        url.add_header('Host', url.get_origin_req_host())
        return _urlopen(url, data, *args, **kwargs)
    else:
        url.add_header('Host', url.origin_req_host)
        kwargs['cadefault'] = True
        return _urlopen(url, data, *args, **kwargs)
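The basic_auth branch above base64-encodes "user:password" into an Authorization header; under Python 3 the string has to be encoded to bytes before b64encode and decoded back afterwards. A minimal Python 3 sketch of that construction (with_basic_auth is an illustrative name):

import base64
from urllib.request import Request

def with_basic_auth(url, user, password):
    token = base64.b64encode('{}:{}'.format(user, password).encode('ascii'))
    req = Request(url)
    req.add_header('Authorization', 'Basic ' + token.decode('ascii'))
    return req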
Example #23
        def get_xml(xml_link):
            try:
                xml_data = _urlopen(xml_link).read()

                if 'xml' in xml_data[0:100]:
                    return xml_data
                else:
                    return None

            except HTTPError:
                return None
Example #24
 def get_seqres_file(self,savefile='pdb_seqres.txt'):
     """Retrieves a (big) file containing all the sequences of PDB entries
     and writes it to a file.
     """
     print "retrieving sequence file. Takes about 15 MB."
     handle = _urlopen(self.pdb_server + 
                       '/pub/pdb/derived_data/pdb_seqres.txt')
     lines = handle.readlines()
     outfile = open(savefile, 'w')
     outfile.writelines(lines)
     outfile.close()
     handle.close()
Example #25
 def get_all_entries(self):
     """Retrieves a big file containing all the
     PDB entries and some annotation to them.
     Returns a list of PDB codes in the index file.
     """
     print("retrieving index file. Takes about 5 MB.")
     url = self.pdb_server + '/pub/pdb/derived_data/index/entries.idx'
     with contextlib.closing(_urlopen(url)) as handle:
         all_entries = [
             line[:4] for line in handle.readlines()[2:] if len(line) > 4
         ]
     return all_entries
Example #26
def get_filepath_or_buffer(filepath_or_buffer,
                           encoding=None,
                           compression=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer
    passthru otherwise.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath, or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding, the compression
    """

    if _is_url(filepath_or_buffer):
        req = _urlopen(str(filepath_or_buffer))
        if compression == 'infer':
            content_encoding = req.headers.get('Content-Encoding', None)
            if content_encoding == 'gzip':
                compression = 'gzip'
        # cat on the compression to the tuple returned by the function
        to_return = list(maybe_read_encoded_stream(req, encoding, compression)) + \
                    [compression]
        return tuple(to_return)

    if _is_s3_url(filepath_or_buffer):
        try:
            import boto
        except:
            raise ImportError("boto is required to handle s3 files")
        # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
        # are environment variables
        parsed_url = parse_url(filepath_or_buffer)

        try:
            conn = boto.connect_s3()
        except boto.exception.NoAuthHandlerFound:
            conn = boto.connect_s3(anon=True)

        b = conn.get_bucket(parsed_url.netloc, validate=False)
        k = boto.s3.key.Key(b)
        k.key = parsed_url.path
        filepath_or_buffer = BytesIO(
            k.get_contents_as_string(encoding=encoding))
        return filepath_or_buffer, None, compression

    return _expand_user(filepath_or_buffer), None, compression
Example #27
    def get_status_list(self, url):
        """Retrieves a list of pdb codes in the weekly pdb status file
        from the given URL. Used by get_recent_files.

        Typical contents of the list files parsed by this method is now
        very simply one PDB name per line.
        """
        with contextlib.closing(_urlopen(url)) as handle:
            answer = []
            for line in handle:
                pdb = line.strip()
                assert len(pdb) == 4
                answer.append(pdb)
        return answer
Example #28
    def get_status_list(self, url):
        """Retrieves a list of pdb codes in the weekly pdb status file
        from the given URL. Used by get_recent_files.

        Typical contents of the list files parsed by this method is now
        very simply one PDB name per line.
        """
        with contextlib.closing(_urlopen(url)) as handle:
            answer = []
            for line in handle:
                pdb = line.strip()
                assert len(pdb) == 4
                answer.append(pdb)
        return answer
Example #29
    def _get_ids(self):
        id_vals = []
        years = range(1988, 2017)

        for year in years:
            soup = _BeautifulSoup(
                _urlopen(
                    'http://www.legislation.gov.uk/ukpga/{0}'.format(year)))
            n_results = _re.search('has returned ([0-9]+) results',
                                   soup.text.lower()).group(1)
            id_vals += [
                str(year) + '_' + str(i) for i in range(1,
                                                        int(n_results) + 1)
            ]

        return id_vals
Example #30
    def _get_data(self, publication_id):

        max_attempts = 10
        attempts = 0

        xml_content = None
        soup = None

        while attempts < max_attempts:
            search_id = _re.sub('_', '/', publication_id)
            try:
                xml_content = _urlopen(
                    'http://www.legislation.gov.uk/ukpga/{0}/data.xml'.format(
                        search_id)).read()
                soup = _BeautifulSoup(xml_content, 'xml')
                break
            except:
                attempts += 1

        if 'amendment' in soup.title.text.lower():
            amend = True
        else:
            amend = False

        if 'repeal' in soup.title.text.lower():
            repeal = True
        else:
            repeal = False

        if soup.EnactmentDate is not None:
            date = soup.EnactmentDate['Date']
        elif soup.PrimaryPrelims is not None:
            date = soup.PrimaryPrelims['RestrictStartDate']
        else:
            date = None
            print 'warning! No date found.'

        meta = _format_meta_entry(country=u'united_kingdom',
                                  title=soup.title.text,
                                  id=publication_id,
                                  date=date,
                                  type=u'annual',
                                  xml=xml_content,
                                  amendment=amend,
                                  repealed=repeal)

        return meta
Example #31
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer.
    Otherwise passthrough.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding, the compression
    """

    if _is_url(filepath_or_buffer):
        url = str(filepath_or_buffer)
        req = _urlopen(url)
        content_encoding = req.headers.get('Content-Encoding', None)
        if content_encoding == 'gzip':
            # Override compression based on Content-Encoding header
            compression = 'gzip'
        reader = BytesIO(req.read())
        return reader, encoding, compression

    if _is_s3_url(filepath_or_buffer):
        from pandas.io import s3
        return s3.get_filepath_or_buffer(filepath_or_buffer,
                                         encoding=encoding,
                                         compression=compression)

    # Convert pathlib.Path/py.path.local or string
    filepath_or_buffer = _stringify_path(filepath_or_buffer)

    if isinstance(filepath_or_buffer, (compat.string_types,
                                       compat.binary_type,
                                       mmap.mmap)):
        return _expand_user(filepath_or_buffer), None, compression

    if not is_file_like(filepath_or_buffer):
        msg = "Invalid file path or buffer object type: {_type}"
        raise ValueError(msg.format(_type=type(filepath_or_buffer)))

    return filepath_or_buffer, None, compression
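When these variants report compression='gzip', the payload in the returned BytesIO reader is still compressed; decompression is left to the caller. A hedged caller-side sketch with the standard gzip module (maybe_decompress is an illustrative name, not a pandas helper):

import gzip
from io import BytesIO

def maybe_decompress(reader, compression):
    if compression == 'gzip':
        return BytesIO(gzip.decompress(reader.read()))
    return reader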
Example #32
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer.
    Otherwise passthrough.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding, the compression
    """

    if _is_url(filepath_or_buffer):
        url = str(filepath_or_buffer)
        req = _urlopen(url)
        content_encoding = req.headers.get('Content-Encoding', None)
        if content_encoding == 'gzip':
            # Override compression based on Content-Encoding header
            compression = 'gzip'
        reader = BytesIO(req.read())
        return reader, encoding, compression

    if _is_s3_url(filepath_or_buffer):
        from pandas.io import s3
        return s3.get_filepath_or_buffer(filepath_or_buffer,
                                         encoding=encoding,
                                         compression=compression)

    # Convert pathlib.Path/py.path.local or string
    filepath_or_buffer = _stringify_path(filepath_or_buffer)

    if isinstance(filepath_or_buffer, (compat.string_types,
                                       compat.binary_type,
                                       mmap.mmap)):
        return _expand_user(filepath_or_buffer), None, compression

    if not is_file_like(filepath_or_buffer):
        msg = "Invalid file path or buffer object type: {_type}"
        raise ValueError(msg.format(_type=type(filepath_or_buffer)))

    return filepath_or_buffer, None, compression
Example #33
File: common.py Project: esc/pandas
def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer
    passthru otherwise.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath, or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding
    """

    if _is_url(filepath_or_buffer):
        req = _urlopen(str(filepath_or_buffer))
        if compat.PY3:  # pragma: no cover
            if encoding:
                errors = 'strict'
            else:
                errors = 'replace'
                encoding = 'utf-8'
            out = StringIO(req.read().decode(encoding, errors))
        else:
            encoding = None
            out = req
        return out, encoding

    if _is_s3_url(filepath_or_buffer):
        try:
            import boto
        except:
            raise ImportError("boto is required to handle s3 files")
        # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
        # are environment variables
        parsed_url = parse_url(filepath_or_buffer)
        conn = boto.connect_s3()
        b = conn.get_bucket(parsed_url.netloc)
        k = boto.s3.key.Key(b)
        k.key = parsed_url.path
        filepath_or_buffer = StringIO(k.get_contents_as_string())
        return filepath_or_buffer, None

    return filepath_or_buffer, None
Example #34
def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer
    passthru otherwise.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath, or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding
    """

    if _is_url(filepath_or_buffer):
        req = _urlopen(str(filepath_or_buffer))
        if compat.PY3:  # pragma: no cover
            if encoding:
                errors = 'strict'
            else:
                errors = 'replace'
                encoding = 'utf-8'
            out = StringIO(req.read().decode(encoding, errors))
        else:
            encoding = None
            out = req
        return out, encoding

    if _is_s3_url(filepath_or_buffer):
        try:
            import boto
        except:
            raise ImportError("boto is required to handle s3 files")
        # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
        # are environment variables
        parsed_url = parse_url(filepath_or_buffer)
        conn = boto.connect_s3()
        b = conn.get_bucket(parsed_url.netloc)
        k = boto.s3.key.Key(b)
        k.key = parsed_url.path
        filepath_or_buffer = StringIO(k.get_contents_as_string())
        return filepath_or_buffer, None

    return filepath_or_buffer, None
Example #35
    def _update_register(self):
        print 'Getting agencies from the federal register...'

        url_base = 'https://www.federalregister.gov/agencies'
        soup = _BeautifulSoup(_urlopen(url_base))
        links = soup.find_all(lambda x: x.name == 'li' and x.has_attr('data-filter-live') and not x.has_attr('class'))

        agency_dic = {}

        for link in links:
            agency = link.find('a')
            name_short = self._preprocess_name(agency.string)
            agency_dic[name_short] = {'html': agency,
                                      'url': agency['href'],
                                      'name_full': agency.string,
                                      'source': 'federal_register'}

            print agency.string

        self.agency_dictionary.update(agency_dic)
Example #36
def get_filepath_or_buffer(filepath_or_buffer,
                           encoding=None,
                           compression=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer
    passthru otherwise.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding, the compression
    """

    if _is_url(filepath_or_buffer):
        req = _urlopen(str(filepath_or_buffer))
        if compression == 'infer':
            content_encoding = req.headers.get('Content-Encoding', None)
            if content_encoding == 'gzip':
                compression = 'gzip'
            else:
                compression = None
        # cat on the compression to the tuple returned by the function
        to_return = (
            list(maybe_read_encoded_stream(req, encoding, compression)) +
            [compression])
        return tuple(to_return)

    if _is_s3_url(filepath_or_buffer):
        from pandas.io.s3 import get_filepath_or_buffer
        return get_filepath_or_buffer(filepath_or_buffer,
                                      encoding=encoding,
                                      compression=compression)

    # It is a pathlib.Path/py.path.local or string
    filepath_or_buffer = _stringify_path(filepath_or_buffer)
    return _expand_user(filepath_or_buffer), None, compression
Example #37
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer
    passthru otherwise.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding, the compression
    """

    if _is_url(filepath_or_buffer):
        req = _urlopen(str(filepath_or_buffer))
        if compression == 'infer':
            content_encoding = req.headers.get('Content-Encoding', None)
            if content_encoding == 'gzip':
                compression = 'gzip'
            else:
                compression = None
        # cat on the compression to the tuple returned by the function
        to_return = (list(maybe_read_encoded_stream(req, encoding,
                                                    compression)) +
                     [compression])
        return tuple(to_return)

    if _is_s3_url(filepath_or_buffer):
        from pandas.io.s3 import get_filepath_or_buffer
        return get_filepath_or_buffer(filepath_or_buffer,
                                      encoding=encoding,
                                      compression=compression)

    # It is a pathlib.Path/py.path.local or string
    filepath_or_buffer = _stringify_path(filepath_or_buffer)
    return _expand_user(filepath_or_buffer), None, compression
Example #38
def get_filepath_or_buffer(filepath_or_buffer,
                           encoding=None,
                           compression=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer.
    Otherwise passthrough.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding, the compression
    """

    if _is_url(filepath_or_buffer):
        url = str(filepath_or_buffer)
        req = _urlopen(url)
        content_encoding = req.headers.get('Content-Encoding', None)
        if content_encoding == 'gzip':
            # Override compression based on Content-Encoding header
            compression = 'gzip'
        reader = BytesIO(req.read())
        return reader, encoding, compression

    if _is_s3_url(filepath_or_buffer):
        from pandas.io import s3
        return s3.get_filepath_or_buffer(filepath_or_buffer,
                                         encoding=encoding,
                                         compression=compression)

    # It is a pathlib.Path/py.path.local or string
    filepath_or_buffer = _stringify_path(filepath_or_buffer)
    return _expand_user(filepath_or_buffer), None, compression
Example #39
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer.
    Otherwise passthrough.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding, the compression
    """

    if _is_url(filepath_or_buffer):
        url = str(filepath_or_buffer)
        req = _urlopen(url)
        content_encoding = req.headers.get('Content-Encoding', None)
        if content_encoding == 'gzip':
            # Override compression based on Content-Encoding header
            compression = 'gzip'
        reader = BytesIO(req.read())
        return reader, encoding, compression

    if _is_s3_url(filepath_or_buffer):
        from pandas.io import s3
        return s3.get_filepath_or_buffer(filepath_or_buffer,
                                         encoding=encoding,
                                         compression=compression)

    # It is a pathlib.Path/py.path.local or string
    filepath_or_buffer = _stringify_path(filepath_or_buffer)
    return _expand_user(filepath_or_buffer), None, compression
Example #40
def get_newest_version(timeout=5, _url=__GLCREATE_CURRENT_VERSION_URL__):
    """
    Returns the version of GraphLab Create currently available from turi.com.
    Will raise an exception if we are unable to reach the turi.com servers.

    Parameters
    ----------
    timeout: int
        How many seconds to wait for the remote server to respond

    url: string
        The URL to go to to check the current version.

    Returns
    -------
    version : str
       The version number of the current graphlab create.
    """
    request = _urlopen(url=_url, timeout=timeout)
    version = request.read()
    if version:
        version = version.decode()
    __LOGGER__.debug("current_version read %s" % version)
    return version
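get_newest_version raises if the turi.com servers cannot be reached, so callers typically wrap it. A hypothetical usage sketch; the fallback handling is our addition, not part of the original module:

# Hypothetical usage of get_newest_version shown above.
try:
    latest = get_newest_version(timeout=5)
except Exception:
    latest = None   # server unreachable; skip the version check
if latest:
    print('latest available version: %s' % latest)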
Example #41
 def urlopen(*args, **kwargs):
     return closing(_urlopen(*args, **kwargs))
Example #42
 def urlopen(f):
     return closing(_urlopen(f))
Example #43
    def _get_ids(self):
        """ Note structure here is a little different than later classes - metadata and IDs retrieved at same time. """
        def get_meta(bill_content):
            """ Get metadata search results based on a given URL. """

            title = bill_content.BillTitle.find(name='Title',
                                                language='en').text

            if 'amend' in title.lower():
                amend = True
            else:
                amend = False

            # rarely, no published version of a bill is available
            publication_tags = [
                t for t in bill_content.find_all('Publication')
                if t.find(name='Title', language='en').text == 'Royal Assent'
            ]
            if len(publication_tags) == 1:
                publication_id = publication_tags[0]['id']
            else:
                publication_id = None

            # all other metadata appear to be consistently present
            date = bill_content.Events.LastMajorStageEvent.Event['date']
            session = bill_content.ParliamentSession['parliamentNumber']
            subtype = bill_content.BillType.find(name='Title',
                                                 language='en').text
            sponsor = bill_content.SponsorAffiliation.Person.FullName.text
            sponsor_party = bill_content.SponsorAffiliation.PoliticalParty.find(
                name='Title', language='en').text
            majority_party = bill_content.PrimeMinister.PoliticalParty.find(
                name='Title', language='en').text

            committee_tags = bill_content.find_all(name='Committee',
                                                   accronym=True)
            committee_names = [t['accronym'] for t in committee_tags]
            committee_data = {
                c: committee_names.count(c)
                for c in set(committee_names)
            }

            metadata = _format_meta_entry(country=u'canada',
                                          title=title,
                                          id=publication_id,
                                          date=date,
                                          session=session,
                                          type=u'annual',
                                          subtype=subtype,
                                          amendment=amend,
                                          sponsor=sponsor,
                                          sponsor_party=sponsor_party,
                                          majority_party=majority_party,
                                          hearings=committee_data)

            return metadata

        base_url = 'http://www.parl.gc.ca{0}'
        bill_types = [
            '/LegisInfo/Result.aspx?BillType=Senate%20Government%20Bill' +
            '&BillStatus=RoyalAssentGiven&Language=E&Mode=1',
            '/LegisInfo/Result.aspx?BillType=Private%20Member%E2%80%99s%20Bill'
            + '&BillStatus=RoyalAssentGiven&Language=E&Mode=1',
            '/LegisInfo/Result.aspx?BillType=House%20Government%20Bill' +
            '&BillStatus=RoyalAssentGiven&Language=E&Mode=1',
            '/LegisInfo/Result.aspx?BillType=Senate%20Public%20Bill'
        ]

        searches = []
        for bill_type in bill_types:
            search_content = _BeautifulSoup(
                _urlopen(base_url.format(bill_type)))
            sessions = [
                _re.sub('&Page=1', '&download=xml', tag['href'])
                for tag in search_content.find_all('a') if _re.search(
                    '[0-9]{2}-[0-9]\s*\([0-9]+\)', tag.text) is not None
            ]
            searches += sessions

        id_vals = []
        for s in searches:
            url = base_url.format(s)
            content = _BeautifulSoup(_urlopen(url).read(), features='xml')

            bills = content.find_all('Bill')
            for bill in bills:
                meta = get_meta(bill)

                if meta['id'] not in self.log_data['Annual']['Canada']:
                    id_vals.append(meta['id'])
                    self.data[meta['id']] = meta

        return id_vals
Example #44
    def _get_data(self, publication_id):
        import bs4

        search_term = _re.sub('_', '/', publication_id)

        text_soup = None
        text_content = None

        try:
            text_url = 'https://www.congress.gov/bill/{0}/text'.format(
                search_term)
            text_soup = _BeautifulSoup(_urlopen(text_url))
        except:
            pass

        if text_soup is not None:
            if text_soup.find('pre') is not None:
                text_content = str(text_soup.find('pre'))
            else:
                text_content = str(
                    text_soup.find('table',
                                   attrs={'class': 'lbexTableStyleEnr'}))

        meta_url = 'https://www.congress.gov/bill/{0}/all-info'.format(
            search_term)
        meta_soup = _BeautifulSoup(_urlopen(meta_url))

        title = _re.search(
            ': (.*)',
            meta_soup.find('meta', attrs={'name': 'dc.title'})['content'])
        if title is not None:
            title = title.group(1)

        date = meta_soup.find('meta', attrs={'name': 'dc.date'})['content']
        sponsor = meta_soup.find('meta', attrs={'name': 'dc.creator'})
        if sponsor is not None:
            sponsor = sponsor['content']

            sponsor_party = _re.search(sponsor + ' \[([A-Z])', meta_soup.text)
            if sponsor_party is not None:
                sponsor_party = sponsor_party.group(1)
        else:
            sponsor_party = None

        cosponsors = [
            tag.text for tag in meta_soup.find_all('a', href=True)
            if 'member/' in tag['href'] and sponsor not in tag.text
        ]

        policy_area = _re.search('Policy Area:\s*(.*)', meta_soup.text)
        if policy_area is not None:
            policy_area = policy_area.group(1)

        committee_entries = meta_soup.find_all('tr', class_='committee')
        referred = [entry.find('th').text for entry in committee_entries]
        hearings_held = []

        for entry in committee_entries:
            committee_name = entry.find('th').text
            actions = [entry.find_all('td')[1].text]

            entry = entry.next_sibling
            while type(entry) == bs4.element.Tag and (
                    'class' not in entry.attrs
                    or 'committee' not in entry['class']):
                actions.append(entry.find_all('td')[1].text)
                entry = entry.next_sibling

                if type(entry) == bs4.element.NavigableString:
                    break

            hearings = [action for action in actions if 'Hearing' in action]
            hearings_held += [committee_name] * len(hearings)

        if 'amend' in title:
            amendment = True
        else:
            amendment = False

        if 'resolution' in publication_id:
            subtype = u'resolution'
        else:
            subtype = u'law'

        meta = _format_meta_entry(country=u'united_states',
                                  title=title,
                                  id=publication_id,
                                  date=date,
                                  type=u'annual',
                                  subtype=subtype,
                                  amendment=amendment,
                                  sponsor=sponsor,
                                  sponsor_party=sponsor_party,
                                  cosponsors=cosponsors,
                                  referred=referred,
                                  hearings=hearings_held,
                                  policy_area=policy_area,
                                  html=text_content)

        return meta
Example #45
        def get_html(html_link):
            html_response = _urlopen(html_link)
            html_data = html_response.read()

            return html_data
Example #46
    def retrieve_pdb_file(self,pdb_code, obsolete=0, compression=None,
            uncompress=None, pdir=None):
        """ Retrieves a PDB structure file from the PDB server and
        stores it in a local file tree.
        The PDB structure is returned as a single string.
        If obsolete==1, the file will be saved in a special file tree.
        If uncompress is specified, a system utility will decompress the .gz
        archive. Otherwise, Python gzip utility will handle it.
        compression does nothing, as all archives are already in .gz format

        @param pdir: put the file in this directory (default: create a PDB-style directory tree) 
        @type pdir: string

        @return: filename
        @rtype: string
        """
        # Alert the user about deprecated parameters
        if compression is not None:
            warnings.warn("PDB file servers now only host .gz archives: "
                    "the compression parameter will not do anything"
                    , BiopythonDeprecationWarning)
        if uncompress is not None:
            warnings.warn("Decompression is handled with the gzip module: "
                    "the uncompression parameter will not do anything"
                    , BiopythonDeprecationWarning)

        # Get the structure
        code=pdb_code.lower()
        filename="pdb%s.ent.gz"%code
        if not obsolete:
            url=(self.pdb_server+
                 '/pub/pdb/data/structures/divided/pdb/%s/pdb%s.ent.gz'
                 % (code[1:3],code))
        else:
            url=(self.pdb_server+
                 '/pub/pdb/data/structures/obsolete/pdb/%s/pdb%s.ent.gz'
                 % (code[1:3],code))
            
        # In which dir to put the pdb file?
        if pdir is None:
            if self.flat_tree:
                if not obsolete:
                    path=self.local_pdb
                else:
                    path=self.obsolete_pdb
            else:
                # Put in PDB-style directory tree
                if not obsolete:
                    path=os.path.join(self.local_pdb, code[1:3])
                else:
                    path=os.path.join(self.obsolete_pdb,code[1:3])
        else:
            # Put in specified directory
            path=pdir
            
        if not os.access(path,os.F_OK):
            os.makedirs(path)
            
        filename=os.path.join(path, filename)
        # the final uncompressed file
        final_file=os.path.join(path, "pdb%s.ent" % code)

        # Skip download if the file already exists
        if not self.overwrite:
            if os.path.exists(final_file):
                print "Structure exists: '%s' " % final_file
                return final_file

        # Retrieve the file
        print "Downloading PDB structure '%s'..." % pdb_code
        lines = _urlopen(url).read()
        open(filename,'wb').write(lines)

        # Uncompress the file
        gz = gzip.open(filename, 'rb')
        out = open(final_file, 'wb')
        out.writelines(gz.read())
        gz.close()
        out.close()
        os.remove(filename)

        return final_file
Example #47
 def urlopen(*args, **kwargs):
     with closing(_urlopen(*args, **kwargs)) as f:
         yield f
Example #48
def urlopen(id):
    key = choice(API_KEY)

    url = URL%(id.strip(), key)
    return _urlopen(url, timeout=60).read()
Example #49
 def urlopen(*args, **kwargs):
     with closing(_urlopen(*args, **kwargs)) as f:
         yield f
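The generator form in Examples #47 and #49 only behaves as a context manager if it sits under contextlib.contextmanager (presumably the decorator is just above the excerpt). A self-contained sketch of the same pattern:

from contextlib import closing, contextmanager
from urllib.request import urlopen as _urlopen

@contextmanager
def urlopen(*args, **kwargs):
    # Ensure the response is always closed, even if the caller raises.
    with closing(_urlopen(*args, **kwargs)) as f:
        yield f

# usage:
# with urlopen('https://example.com/') as handle:
#     payload = handle.read()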
Example #50
    def retrieve_pdb_file(self, pdb_code, obsolete=0, compression=None, uncompress=None, pdir=None):
        """ Retrieves a PDB structure file from the PDB server and
        stores it in a local file tree.
        The PDB structure is returned as a single string.
        If obsolete==1, the file will be saved in a special file tree.
        If uncompress is specified, a system utility will decompress the .gz
        archive. Otherwise, Python gzip utility will handle it.
        compression does nothing, as all archives are already in .gz format

        @param pdir: put the file in this directory (default: create a PDB-style directory tree) 
        @type pdir: string

        @return: filename
        @rtype: string
        """
        # Alert the user about deprecated parameters
        if compression is not None:
            warnings.warn(
                "PDB file servers now only host .gz archives: " "the compression parameter will not do anything",
                BiopythonDeprecationWarning,
            )
        if uncompress is not None:
            warnings.warn(
                "Decompression is handled with the gzip module: " "the uncompression parameter will not do anything",
                BiopythonDeprecationWarning,
            )

        # Get the structure
        code = pdb_code.lower()
        filename = "pdb%s.ent.gz" % code
        if not obsolete:
            url = self.pdb_server + "/pub/pdb/data/structures/divided/pdb/%s/pdb%s.ent.gz" % (code[1:3], code)
        else:
            url = self.pdb_server + "/pub/pdb/data/structures/obsolete/pdb/%s/pdb%s.ent.gz" % (code[1:3], code)

        # In which dir to put the pdb file?
        if pdir is None:
            if self.flat_tree:
                if not obsolete:
                    path = self.local_pdb
                else:
                    path = self.obsolete_pdb
            else:
                # Put in PDB-style directory tree
                if not obsolete:
                    path = os.path.join(self.local_pdb, code[1:3])
                else:
                    path = os.path.join(self.obsolete_pdb, code[1:3])
        else:
            # Put in specified directory
            path = pdir

        if not os.access(path, os.F_OK):
            os.makedirs(path)

        filename = os.path.join(path, filename)
        # the final uncompressed file
        final_file = os.path.join(path, "pdb%s.ent" % code)

        # Skip download if the file already exists
        if not self.overwrite:
            if os.path.exists(final_file):
                print "Structure exists: '%s' " % final_file
                return final_file

        # Retrieve the file
        print "Downloading PDB structure '%s'..." % pdb_code
        lines = _urlopen(url).read()
        open(filename, "wb").write(lines)

        # Uncompress the file
        gz = gzip.open(filename, "rb")
        out = open(final_file, "wb")
        out.writelines(gz.read())
        gz.close()
        out.close()
        os.remove(filename)

        return final_file
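The download-and-decompress tail of retrieve_pdb_file can be written with context managers so the network handle and both files are always closed. A hedged sketch of just that tail; url, filename and final_file are as in the example:

import gzip
import os
import shutil
from urllib.request import urlopen

def fetch_and_gunzip(url, filename, final_file):
    with urlopen(url) as resp, open(filename, 'wb') as gz_out:
        shutil.copyfileobj(resp, gz_out)        # save the .gz archive
    with gzip.open(filename, 'rb') as gz, open(final_file, 'wb') as out:
        shutil.copyfileobj(gz, out)             # decompress to the .ent file
    os.remove(filename)                         # drop the compressed copy
    return final_file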