def get_filepath_or_buffer(filepath_or_buffer, encoding=None): """ If the filepath_or_buffer is a url, translate and return the buffer passthru otherwise. Parameters ---------- filepath_or_buffer : a url, filepath, or buffer encoding : the encoding to use to decode py3 bytes, default is 'utf-8' Returns ------- a filepath_or_buffer, the encoding """ if _is_url(filepath_or_buffer): req = _urlopen(str(filepath_or_buffer)) return maybe_read_encoded_stream(req, encoding) if _is_s3_url(filepath_or_buffer): try: import boto except: raise ImportError("boto is required to handle s3 files") # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY # are environment variables parsed_url = parse_url(filepath_or_buffer) conn = boto.connect_s3() b = conn.get_bucket(parsed_url.netloc) k = boto.s3.key.Key(b) k.key = parsed_url.path filepath_or_buffer = StringIO(k.get_contents_as_string()) return filepath_or_buffer, None return filepath_or_buffer, None
def get_filepath_or_buffer(filepath_or_buffer, encoding=None): """ If the filepath_or_buffer is a url, translate and return the buffer passthru otherwise. Parameters ---------- filepath_or_buffer : a url, filepath, or buffer encoding : the encoding to use to decode py3 bytes, default is 'utf-8' Returns ------- a filepath_or_buffer, the encoding """ if _is_url(filepath_or_buffer): req = _urlopen(str(filepath_or_buffer)) return maybe_read_encoded_stream(req,encoding) if _is_s3_url(filepath_or_buffer): try: import boto except: raise ImportError("boto is required to handle s3 files") # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY # are environment variables parsed_url = parse_url(filepath_or_buffer) conn = boto.connect_s3() b = conn.get_bucket(parsed_url.netloc) k = boto.s3.key.Key(b) k.key = parsed_url.path filepath_or_buffer = StringIO(k.get_contents_as_string()) return filepath_or_buffer, None return filepath_or_buffer, None
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer;
    otherwise pass it through.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding, the compression
    """
    if _is_url(filepath_or_buffer):
        req = _urlopen(str(filepath_or_buffer))
        if compression == 'infer':
            content_encoding = req.headers.get('Content-Encoding', None)
            if content_encoding == 'gzip':
                compression = 'gzip'
            else:
                compression = None
        # Append the compression to the tuple returned by the function
        to_return = (list(maybe_read_encoded_stream(req, encoding,
                                                    compression)) +
                     [compression])
        return tuple(to_return)

    if _is_s3_url(filepath_or_buffer):
        try:
            import boto
        except ImportError:
            raise ImportError("boto is required to handle s3 files")
        # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
        # are environment variables
        parsed_url = parse_url(filepath_or_buffer)
        try:
            conn = boto.connect_s3()
        except boto.exception.NoAuthHandlerFound:
            conn = boto.connect_s3(anon=True)

        b = conn.get_bucket(parsed_url.netloc, validate=False)
        if compat.PY2 and (compression == 'gzip' or
                           (compression == 'infer' and
                            filepath_or_buffer.endswith(".gz"))):
            k = boto.s3.key.Key(b, parsed_url.path)
            filepath_or_buffer = BytesIO(k.get_contents_as_string(
                encoding=encoding))
        else:
            k = BotoFileLikeReader(b, parsed_url.path, encoding=encoding)
            k.open('r')  # Expose read errors immediately
            filepath_or_buffer = k
        return filepath_or_buffer, None, compression

    # It is a pathlib.Path/py.path.local or string
    filepath_or_buffer = _stringify_path(filepath_or_buffer)
    return _expand_user(filepath_or_buffer), None, compression
def urlopen(url):
    sys.stdout.flush()
    url = url.replace('&max-results=20', '&max-results=100')
    if '&key' not in url:
        url += key
    print url
    return _urlopen(url, timeout=60).read()
def get_recent_changes(self):
    """Returns three lists of the newest weekly files (added,mod,obsolete).

    Reads the directories with changed entries from the PDB server and
    returns a tuple of three URL's to the files of new, modified and
    obsolete entries from the most recent list. The directory with the
    largest numerical name is used.
    Returns None if something goes wrong.

    Contents of the data/status dir (20031013 would be used);
    drwxrwxr-x   2 1002  sysadmin   512 Oct  6 18:28 20031006
    drwxrwxr-x   2 1002  sysadmin   512 Oct 14 02:14 20031013
    -rw-r--r--   1 1002  sysadmin  1327 Mar 12  2001 README
    """
    url = self.pdb_server + '/pub/pdb/data/status/'
    with contextlib.closing(_urlopen(url)) as handle:
        recent = filter(str.isdigit,
                        (x.split()[-1] for x in handle.readlines())
                        )[-1]

    path = self.pdb_server + '/pub/pdb/data/status/%s/' % (recent)

    # Retrieve the lists
    added = self.get_status_list(path + 'added.pdb')
    modified = self.get_status_list(path + 'modified.pdb')
    obsolete = self.get_status_list(path + 'obsolete.pdb')
    return [added, modified, obsolete]
def get_all_obsolete(self):
    """Returns a list of all obsolete entries ever in the PDB.

    Returns a list of all obsolete pdb codes that have ever been
    in the PDB.

    Gets and parses the file from the PDB server in the format
    (the first pdb_code column is the one used). The file looks
    like this:

     LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
    OBSLTE    31-JUL-94 116L     216L
    ...
    OBSLTE    29-JAN-96 1HFT     2HFT
    OBSLTE    21-SEP-06 1HFV     2J5X
    OBSLTE    21-NOV-03 1HG6
    OBSLTE    18-JUL-84 1HHB     2HHB 3HHB
    OBSLTE    08-NOV-96 1HID     2HID
    OBSLTE    01-APR-97 1HIU     2HIU
    OBSLTE    14-JAN-04 1HKE     1UUZ
    ...
    """
    url = self.pdb_server + '/pub/pdb/data/status/obsolete.dat'
    with contextlib.closing(_urlopen(url)) as handle:
        # Extract pdb codes. Could use a list comprehension, but I want
        # to include an assert to check for mis-reading the data.
        obsolete = []
        for line in handle:
            if not line.startswith("OBSLTE "):
                continue
            pdb = line.split()[2]
            assert len(pdb) == 4
            obsolete.append(pdb)
    return obsolete
def _update_usa(self):
    """ Update whitelist based on usa.gov """
    print 'Getting agencies from usa.gov...'
    url_base = 'https://www.usa.gov'
    letters = _string.ascii_lowercase
    agency_dic = {}
    for letter in letters:
        url = url_base + '/federal-agencies/' + letter
        soup = _BeautifulSoup(_urlopen(url).read())
        links_content = [l for l in soup.find_all('ul')
                         if 'class' in l.attrs
                         and 'one_column_bullet' in l['class']]
        if len(links_content) == 1:
            links_list = links_content[0].find_all('a')
            for agency_html in links_list:
                name_short = self._preprocess_name(agency_html.string)
                agency_dic[name_short] = {'html': agency_html,
                                          'url': url_base + agency_html['href'],
                                          'name_full': agency_html.string,
                                          'source': 'usa.gov'}
                print agency_html.string
        elif len(links_content) == 0:
            pass
        else:
            raise ValueError('Too many list elements found! Please modify the HTML parser.')
    self.agency_dictionary.update(agency_dic)
def urlopen(url, data=None, lang='en'):
    request = Request(url, data, {
        "Accept-Language": "%s,en-us;q=0.7,en;q=0.3" % lang.lower(),
        "User-Agent": UA,
    })
    logging.debug("urlopen: %s", url)
    time.sleep(URLOPEN_DELAY)
    return _urlopen(request)
def get_all_entries(self):
    """Retrieves a big file containing all the PDB entries
    and some annotation to them.
    Returns a list of PDB codes in the index file.
    """
    print "retrieving index file. Takes about 5 MB."
    url = _urlopen(self.pdb_server +
                   "/pub/pdb/derived_data/index/entries.idx")
    return [line[:4] for line in url.readlines()[2:] if len(line) > 4]
def fetch_film_info_from_criticker(film_data):
    url = 'http://www.criticker.com/?f=' + film_data['criticker_id']
    title_page = None
    try:
        page = unicode(_urlopen(url, None, 5).read(), 'iso-8859-1')
        soup = BeautifulSoup(page)
        title_page = soup.find("div", attrs={"id": "fi_info_filmname"})
    except URLError, e:
        logger.error("URL Error: " + str(e.reason) + ": " + url)
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None, mode=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer.
    Otherwise passthrough.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'
    mode : str, optional

    Returns
    -------
    tuple of ({a filepath_ or buffer or S3File instance},
              encoding, str,
              compression, str,
              should_close, bool)
    """
    filepath_or_buffer = _stringify_path(filepath_or_buffer)

    if _is_url(filepath_or_buffer):
        req = _urlopen(filepath_or_buffer)
        content_encoding = req.headers.get('Content-Encoding', None)
        if content_encoding == 'gzip':
            # Override compression based on Content-Encoding header
            compression = 'gzip'
        reader = BytesIO(req.read())
        req.close()
        return reader, encoding, compression, True

    if is_s3_url(filepath_or_buffer):
        from pandas.io import s3
        return s3.get_filepath_or_buffer(filepath_or_buffer,
                                         encoding=encoding,
                                         compression=compression,
                                         mode=mode)

    if is_gcs_url(filepath_or_buffer):
        from pandas.io import gcs
        return gcs.get_filepath_or_buffer(filepath_or_buffer,
                                          encoding=encoding,
                                          compression=compression,
                                          mode=mode)

    if isinstance(filepath_or_buffer, (compat.string_types,
                                       compat.binary_type,
                                       mmap.mmap)):
        return _expand_user(filepath_or_buffer), None, compression, False

    if not is_file_like(filepath_or_buffer):
        msg = "Invalid file path or buffer object type: {_type}"
        raise ValueError(msg.format(_type=type(filepath_or_buffer)))

    return filepath_or_buffer, None, compression, False
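A minimal usage sketch for the variant above. The helper name `read_text_from` and the example URL are hypothetical, and `get_filepath_or_buffer` is assumed to be in scope; the sketch only illustrates unpacking the four-element return value and closing the buffer when `should_close` is set.

def read_text_from(path_or_url, encoding="utf-8"):
    # Hypothetical helper; assumes get_filepath_or_buffer (above) is in scope.
    fp_or_buf, enc, compression, should_close = get_filepath_or_buffer(
        path_or_url, encoding=encoding)
    if isinstance(fp_or_buf, str):
        # Local paths are returned unchanged, so open them here.
        with open(fp_or_buf, "r") as f:
            return f.read()
    try:
        data = fp_or_buf.read()
        # URL responses come back as a BytesIO of raw bytes.
        return data.decode(enc or encoding) if isinstance(data, bytes) else data
    finally:
        # Only close buffers that the helper opened itself.
        if should_close:
            fp_or_buf.close()

# text = read_text_from("https://example.com/data.csv")  # placeholder URL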
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer;
    otherwise pass it through.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath, or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding, the compression
    """
    if _is_url(filepath_or_buffer):
        req = _urlopen(str(filepath_or_buffer))
        if compression == 'infer':
            content_encoding = req.headers.get('Content-Encoding', None)
            if content_encoding == 'gzip':
                compression = 'gzip'
            else:
                compression = None
        # Append the compression to the tuple returned by the function
        to_return = (list(maybe_read_encoded_stream(req, encoding,
                                                    compression)) +
                     [compression])
        return tuple(to_return)

    if _is_s3_url(filepath_or_buffer):
        try:
            import boto
        except ImportError:
            raise ImportError("boto is required to handle s3 files")
        # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
        # are environment variables
        parsed_url = parse_url(filepath_or_buffer)
        try:
            conn = boto.connect_s3()
        except boto.exception.NoAuthHandlerFound:
            conn = boto.connect_s3(anon=True)

        b = conn.get_bucket(parsed_url.netloc, validate=False)
        if compat.PY2 and (compression == 'gzip' or
                           (compression == 'infer' and
                            filepath_or_buffer.endswith(".gz"))):
            k = boto.s3.key.Key(b, parsed_url.path)
            filepath_or_buffer = BytesIO(k.get_contents_as_string(
                encoding=encoding))
        else:
            k = BotoFileLikeReader(b, parsed_url.path, encoding=encoding)
            k.open('r')  # Expose read errors immediately
            filepath_or_buffer = k
        return filepath_or_buffer, None, compression

    return _expand_user(filepath_or_buffer), None, compression
def load(filename):
    if filename[:4] == 'http':
        resp = _urlopen(filename)
        dict = _json.loads(resp.read(), object_pairs_hook=_parse_json)
    else:
        filename = os.path.expanduser(filename)
        f = open(filename, 'r')
        dict = _json.load(f, object_pairs_hook=_parse_json)
        f.close()
    return dict
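A brief usage sketch for load above; the file names are placeholders and the function is assumed to be in scope.

# Both branches return a parsed JSON object via _parse_json.
remote_cfg = load("http://example.com/config.json")  # placeholder URL, fetched with _urlopen
local_cfg = load("~/project/config.json")            # placeholder path, '~' expanded by os.path.expanduser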
def get_all_entries(self):
    """Retrieves a big file containing all the PDB entries
    and some annotation to them.
    Returns a list of PDB codes in the index file.
    """
    print("retrieving index file. Takes about 5 MB.")
    url = self.pdb_server + '/pub/pdb/derived_data/index/entries.idx'
    with contextlib.closing(_urlopen(url)) as handle:
        all_entries = [line[:4] for line in handle.readlines()[2:]
                       if len(line) > 4]
    return all_entries
def get_seqres_file(self, savefile="pdb_seqres.txt"):
    """Retrieves a (big) file containing all the sequences of PDB entries
    and writes it to a file.
    """
    print "retrieving sequence file. Takes about 15 MB."
    handle = _urlopen(self.pdb_server +
                      "/pub/pdb/derived_data/pdb_seqres.txt")
    lines = handle.readlines()
    outfile = open(savefile, "w")
    outfile.writelines(lines)
    outfile.close()
    handle.close()
def _update_wikipedia(self):
    # do a little bit of name preprocessing here too
    from requests import ConnectionError
    from wikipedia import PageError
    print 'Getting data from Wikipedia...'
    page_current = _wikipedia.page('List_of_federal_agencies_in_the_United_States')
    html = page_current.html()
    subset = html[_re.search('<h2>.*?Legislative Branch', html).start():
                  _re.search('<h2>.*?See also', html).start()]
    soup = _BeautifulSoup(subset)
    links = soup.find_all(lambda x: x.name == 'a' and x.has_attr('href')
                          and '/wiki/' in x['href'] and 'File:' not in x['href'])
    agency_dic = {self._preprocess_name(link['title']):
                  {'html': link,
                   'url': 'https://en.wikipedia.org' + link['href'],
                   'name_full': link['title'],
                   'source': 'wikipedia'}
                  for link in links}
    category_pages = ['https://en.wikipedia.org/w/api.php?action=query&list=categorymembers&' +
                      'cmtitle=Category:Defunct_agencies_of_the_United_States_government&cmlimit=500&format=json',
                      'https://en.wikipedia.org/w/api.php?action=query&list=categorymembers&' +
                      'cmtitle=Category:Committees_of_the_United_States_Senate&cmlimit=500&format=json',
                      'https://en.wikipedia.org/w/api.php?action=query&list=categorymembers&' +
                      'cmtitle=Category:Joint_committees_of_the_United_States_Congress' +
                      '&cmlimit=500&format=json',
                      'https://en.wikipedia.org/w/api.php?action=query&list=categorymembers&' +
                      'cmtitle=Category:Committees_of_the_United_States_House_of_Representatives' +
                      '&cmlimit=500&format=json',
                      ]
    for category_page in category_pages:
        content_defunct = _json.loads(_urlopen(category_page).read())
        for result in content_defunct['query']['categorymembers']:
            if result['ns'] == 0:
                url_defunct = ('https://en.wikipedia.org/wiki/' +
                               _re.sub(' ', '_', result['title']))
                print(result['title'])
                try:
                    page_defunct = _wikipedia.page(result['title'])
                    name_short = self._preprocess_name(result['title'])
                    agency_dic[name_short] = {'html': page_defunct.html(),
                                              'url': url_defunct,
                                              'name_full': result['title'],
                                              'source': 'wikipedia'}
                except (ConnectionError, PageError):
                    print('Failed to get agency HTML!')
    self.agency_dictionary.update(agency_dic)
def urlopen(url, data=None, *args, **kwargs):
    if not isinstance(url, Request):
        url = Request(url, data)
        data = None
    if 'basic_auth' in kwargs:
        if kwargs['basic_auth']:
            a = base64.b64encode(':'.join(kwargs['basic_auth']))
            url.add_header('Authorization', 'Basic ' + a)
        del(kwargs['basic_auth'])
    if 'authorization' in kwargs:
        if kwargs['authorization']:
            url.add_header('Authorization', kwargs['authorization'])
        del(kwargs['authorization'])
    if sys.version_info[0] == 2:
        url.add_header('Host', url.get_origin_req_host())
        return _urlopen(url, data, *args, **kwargs)
    else:
        url.add_header('Host', url.origin_req_host)
        kwargs['cadefaults'] = True
        return _urlopen(url, data, *args, **kwargs)
def get_xml(xml_link):
    try:
        xml_data = _urlopen(xml_link).read()
        if 'xml' in xml_data[0:100]:
            return xml_data
        else:
            return None
    except HTTPError:
        return None
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer;
    otherwise pass it through.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath, or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding, the compression
    """
    if _is_url(filepath_or_buffer):
        req = _urlopen(str(filepath_or_buffer))
        if compression == 'infer':
            content_encoding = req.headers.get('Content-Encoding', None)
            if content_encoding == 'gzip':
                compression = 'gzip'
        # Append the compression to the tuple returned by the function
        to_return = (list(maybe_read_encoded_stream(req, encoding,
                                                    compression)) +
                     [compression])
        return tuple(to_return)

    if _is_s3_url(filepath_or_buffer):
        try:
            import boto
        except ImportError:
            raise ImportError("boto is required to handle s3 files")
        # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
        # are environment variables
        parsed_url = parse_url(filepath_or_buffer)
        try:
            conn = boto.connect_s3()
        except boto.exception.NoAuthHandlerFound:
            conn = boto.connect_s3(anon=True)

        b = conn.get_bucket(parsed_url.netloc, validate=False)
        k = boto.s3.key.Key(b)
        k.key = parsed_url.path
        filepath_or_buffer = BytesIO(
            k.get_contents_as_string(encoding=encoding))
        return filepath_or_buffer, None, compression

    return _expand_user(filepath_or_buffer), None, compression
def get_status_list(self, url):
    """Retrieves a list of pdb codes in the weekly pdb status file
    from the given URL. Used by get_recent_files.

    Typical contents of the list files parsed by this method is now
    very simply one PDB name per line.
    """
    with contextlib.closing(_urlopen(url)) as handle:
        answer = []
        for line in handle:
            pdb = line.strip()
            assert len(pdb) == 4
            answer.append(pdb)
    return answer
def _get_ids(self):
    id_vals = []
    years = range(1988, 2017)
    for year in years:
        soup = _BeautifulSoup(
            _urlopen(
                'http://www.legislation.gov.uk/ukpga/{0}'.format(year)))
        n_results = _re.search('has returned ([0-9]+) results',
                               soup.text.lower()).group(1)
        id_vals += [
            str(year) + '_' + str(i) for i in range(1, int(n_results) + 1)
        ]
    return id_vals
def _get_data(self, publication_id):
    max_attempts = 10
    attempts = 0
    xml_content = None
    soup = None
    while attempts < max_attempts:
        search_id = _re.sub('_', '/', publication_id)
        try:
            xml_content = _urlopen(
                'http://www.legislation.gov.uk/ukpga/{0}/data.xml'.format(
                    search_id)).read()
            soup = _BeautifulSoup(xml_content, 'xml')
            break
        except:
            attempts += 1

    if 'amendment' in soup.title.text.lower():
        amend = True
    else:
        amend = False
    if 'repeal' in soup.title.text.lower():
        repeal = True
    else:
        repeal = False
    if soup.EnactmentDate is not None:
        date = soup.EnactmentDate['Date']
    elif soup.PrimaryPrelims is not None:
        date = soup.PrimaryPrelims['RestrictStartDate']
    else:
        date = None
        print 'warning! No date found.'

    meta = _format_meta_entry(country=u'united_kingdom',
                              title=soup.title.text,
                              id=publication_id,
                              date=date,
                              type=u'annual',
                              xml=xml_content,
                              amendment=amend,
                              repealed=repeal)
    return meta
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer.
    Otherwise passthrough.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding, the compression
    """
    if _is_url(filepath_or_buffer):
        url = str(filepath_or_buffer)
        req = _urlopen(url)
        content_encoding = req.headers.get('Content-Encoding', None)
        if content_encoding == 'gzip':
            # Override compression based on Content-Encoding header
            compression = 'gzip'
        reader = BytesIO(req.read())
        return reader, encoding, compression

    if _is_s3_url(filepath_or_buffer):
        from pandas.io import s3
        return s3.get_filepath_or_buffer(filepath_or_buffer,
                                         encoding=encoding,
                                         compression=compression)

    # Convert pathlib.Path/py.path.local or string
    filepath_or_buffer = _stringify_path(filepath_or_buffer)

    if isinstance(filepath_or_buffer, (compat.string_types,
                                       compat.binary_type,
                                       mmap.mmap)):
        return _expand_user(filepath_or_buffer), None, compression

    if not is_file_like(filepath_or_buffer):
        msg = "Invalid file path or buffer object type: {_type}"
        raise ValueError(msg.format(_type=type(filepath_or_buffer)))

    return filepath_or_buffer, None, compression
def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer;
    otherwise pass it through.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath, or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding
    """
    if _is_url(filepath_or_buffer):
        req = _urlopen(str(filepath_or_buffer))
        if compat.PY3:  # pragma: no cover
            if encoding:
                errors = 'strict'
            else:
                errors = 'replace'
                encoding = 'utf-8'
            out = StringIO(req.read().decode(encoding, errors))
        else:
            encoding = None
            out = req
        return out, encoding

    if _is_s3_url(filepath_or_buffer):
        try:
            import boto
        except ImportError:
            raise ImportError("boto is required to handle s3 files")
        # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
        # are environment variables
        parsed_url = parse_url(filepath_or_buffer)
        conn = boto.connect_s3()
        b = conn.get_bucket(parsed_url.netloc)
        k = boto.s3.key.Key(b)
        k.key = parsed_url.path
        filepath_or_buffer = StringIO(k.get_contents_as_string())
        return filepath_or_buffer, None

    return filepath_or_buffer, None
def _update_register(self):
    print 'Getting agencies from the federal register...'
    url_base = 'https://www.federalregister.gov/agencies'
    soup = _BeautifulSoup(_urlopen(url_base))
    links = soup.find_all(lambda x: x.name == 'li'
                          and x.has_attr('data-filter-live')
                          and not x.has_attr('class'))
    agency_dic = {}
    for link in links:
        agency = link.find('a')
        name_short = self._preprocess_name(agency.string)
        agency_dic[name_short] = {'html': agency,
                                  'url': agency['href'],
                                  'name_full': agency.string,
                                  'source': 'federal_register'}
        print agency.string
    self.agency_dictionary.update(agency_dic)
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer;
    otherwise pass it through.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding, the compression
    """
    if _is_url(filepath_or_buffer):
        req = _urlopen(str(filepath_or_buffer))
        if compression == 'infer':
            content_encoding = req.headers.get('Content-Encoding', None)
            if content_encoding == 'gzip':
                compression = 'gzip'
            else:
                compression = None
        # Append the compression to the tuple returned by the function
        to_return = (list(maybe_read_encoded_stream(req, encoding,
                                                    compression)) +
                     [compression])
        return tuple(to_return)

    if _is_s3_url(filepath_or_buffer):
        from pandas.io.s3 import get_filepath_or_buffer
        return get_filepath_or_buffer(filepath_or_buffer,
                                      encoding=encoding,
                                      compression=compression)

    # It is a pathlib.Path/py.path.local or string
    filepath_or_buffer = _stringify_path(filepath_or_buffer)
    return _expand_user(filepath_or_buffer), None, compression
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer.
    Otherwise passthrough.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding, the compression
    """
    if _is_url(filepath_or_buffer):
        url = str(filepath_or_buffer)
        req = _urlopen(url)
        content_encoding = req.headers.get('Content-Encoding', None)
        if content_encoding == 'gzip':
            # Override compression based on Content-Encoding header
            compression = 'gzip'
        reader = BytesIO(req.read())
        return reader, encoding, compression

    if _is_s3_url(filepath_or_buffer):
        from pandas.io import s3
        return s3.get_filepath_or_buffer(filepath_or_buffer,
                                         encoding=encoding,
                                         compression=compression)

    # It is a pathlib.Path/py.path.local or string
    filepath_or_buffer = _stringify_path(filepath_or_buffer)
    return _expand_user(filepath_or_buffer), None, compression
def get_newest_version(timeout=5, _url=__GLCREATE_CURRENT_VERSION_URL__):
    """
    Returns the version of GraphLab Create currently available from turi.com.
    Will raise an exception if we are unable to reach the turi.com servers.

    Parameters
    ----------
    timeout: int
        How many seconds to wait for the remote server to respond

    url: string
        The URL to go to to check the current version.

    Returns
    -------
    version : str
        The version number of the current graphlab create.
    """
    request = _urlopen(url=_url, timeout=timeout)
    version = request.read()
    if version:
        version = version.decode()
    __LOGGER__.debug("current_version read %s" % version)
    return version
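A small, hedged wrapper around get_newest_version above, since its docstring notes that unreachable servers raise; check_for_update and its fallback behaviour are illustrative assumptions, not library API.

def check_for_update(current_version, timeout=5):
    # Hypothetical helper: returns the newer version string, or None if the
    # server is unreachable or we are already up to date.
    try:
        latest = get_newest_version(timeout=timeout)
    except Exception:
        return None
    return latest if latest and latest != current_version else None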
def urlopen(*args, **kwargs):
    return closing(_urlopen(*args, **kwargs))
def urlopen(f):
    return closing(_urlopen(f))
def _get_ids(self):
    """
    Note structure here is a little different than later classes -
    metadata and IDs retrieved at same time.
    """

    def get_meta(bill_content):
        """ Get metadata search results based on a given URL. """
        title = bill_content.BillTitle.find(name='Title',
                                            language='en').text
        if 'amend' in title.lower():
            amend = True
        else:
            amend = False

        # rarely, no published version of a bill is available
        publication_tags = [
            t for t in bill_content.find_all('Publication')
            if t.find(name='Title', language='en').text == 'Royal Assent'
        ]
        if len(publication_tags) == 1:
            publication_id = publication_tags[0]['id']
        else:
            publication_id = None

        # all other metadata appear to be consistently present
        date = bill_content.Events.LastMajorStageEvent.Event['date']
        session = bill_content.ParliamentSession['parliamentNumber']
        subtype = bill_content.BillType.find(name='Title',
                                             language='en').text
        sponsor = bill_content.SponsorAffiliation.Person.FullName.text
        sponsor_party = bill_content.SponsorAffiliation.PoliticalParty.find(
            name='Title', language='en').text
        majority_party = bill_content.PrimeMinister.PoliticalParty.find(
            name='Title', language='en').text
        committee_tags = bill_content.find_all(name='Committee',
                                               accronym=True)
        committee_names = [t['accronym'] for t in committee_tags]
        committee_data = {
            c: committee_names.count(c) for c in set(committee_names)
        }

        metadata = _format_meta_entry(country=u'canada',
                                      title=title,
                                      id=publication_id,
                                      date=date,
                                      session=session,
                                      type=u'annual',
                                      subtype=subtype,
                                      amendment=amend,
                                      sponsor=sponsor,
                                      sponsor_party=sponsor_party,
                                      majority_party=majority_party,
                                      hearings=committee_data)
        return metadata

    base_url = 'http://www.parl.gc.ca{0}'
    bill_types = [
        '/LegisInfo/Result.aspx?BillType=Senate%20Government%20Bill' +
        '&BillStatus=RoyalAssentGiven&Language=E&Mode=1',
        '/LegisInfo/Result.aspx?BillType=Private%20Member%E2%80%99s%20Bill' +
        '&BillStatus=RoyalAssentGiven&Language=E&Mode=1',
        '/LegisInfo/Result.aspx?BillType=House%20Government%20Bill' +
        '&BillStatus=RoyalAssentGiven&Language=E&Mode=1',
        '/LegisInfo/Result.aspx?BillType=Senate%20Public%20Bill'
    ]

    searches = []
    for bill_type in bill_types:
        search_content = _BeautifulSoup(
            _urlopen(base_url.format(bill_type)))
        sessions = [
            _re.sub('&Page=1', '&download=xml', tag['href'])
            for tag in search_content.find_all('a')
            if _re.search('[0-9]{2}-[0-9]\s*\([0-9]+\)',
                          tag.text) is not None
        ]
        searches += sessions

    id_vals = []
    for s in searches:
        url = base_url.format(s)
        content = _BeautifulSoup(_urlopen(url).read(), features='xml')
        bills = content.find_all('Bill')
        for bill in bills:
            meta = get_meta(bill)
            if meta['id'] not in self.log_data['Annual']['Canada']:
                id_vals.append(meta['id'])
                self.data[meta['id']] = meta
    return id_vals
def _get_data(self, publication_id):
    import bs4
    search_term = _re.sub('_', '/', publication_id)

    text_soup = None
    text_content = None
    try:
        text_url = 'https://www.congress.gov/bill/{0}/text'.format(
            search_term)
        text_soup = _BeautifulSoup(_urlopen(text_url))
    except:
        pass
    if text_soup is not None:
        if text_soup.find('pre') is not None:
            text_content = str(text_soup.find('pre'))
        else:
            text_content = str(
                text_soup.find('table',
                               attrs={'class': 'lbexTableStyleEnr'}))

    meta_url = 'https://www.congress.gov/bill/{0}/all-info'.format(
        search_term)
    meta_soup = _BeautifulSoup(_urlopen(meta_url))

    title = _re.search(
        ': (.*)',
        meta_soup.find('meta', attrs={'name': 'dc.title'})['content'])
    if title is not None:
        title = title.group(1)
    date = meta_soup.find('meta', attrs={'name': 'dc.date'})['content']

    sponsor = meta_soup.find('meta', attrs={'name': 'dc.creator'})
    if sponsor is not None:
        sponsor = sponsor['content']
        sponsor_party = _re.search(sponsor + ' \[([A-Z])', meta_soup.text)
        if sponsor_party is not None:
            sponsor_party = sponsor_party.group(1)
    else:
        sponsor_party = None

    cosponsors = [
        tag.text for tag in meta_soup.find_all('a', href=True)
        if 'member/' in tag['href'] and sponsor not in tag.text
    ]

    policy_area = _re.search('Policy Area:\s*(.*)', meta_soup.text)
    if policy_area is not None:
        policy_area = policy_area.group(1)

    committee_entries = meta_soup.find_all('tr', class_='committee')
    referred = [entry.find('th').text for entry in committee_entries]

    hearings_held = []
    for entry in committee_entries:
        committee_name = entry.find('th').text
        actions = [entry.find_all('td')[1].text]
        entry = entry.next_sibling
        while type(entry) == bs4.element.Tag and (
                'class' not in entry.attrs or
                'committee' not in entry['class']):
            actions.append(entry.find_all('td')[1].text)
            entry = entry.next_sibling
            if type(entry) == bs4.element.NavigableString:
                break
        hearings = [action for action in actions if 'Hearing' in action]
        hearings_held += [committee_name] * len(hearings)

    if 'amend' in title:
        amendment = True
    else:
        amendment = False
    if 'resolution' in publication_id:
        subtype = u'resolution'
    else:
        subtype = u'law'

    meta = _format_meta_entry(country=u'united_states',
                              title=title,
                              id=publication_id,
                              date=date,
                              type=u'annual',
                              subtype=subtype,
                              amendment=amendment,
                              sponsor=sponsor,
                              sponsor_party=sponsor_party,
                              cosponsors=cosponsors,
                              referred=referred,
                              hearings=hearings_held,
                              policy_area=policy_area,
                              html=text_content)
    return meta
def get_html(html_link):
    html_response = _urlopen(html_link)
    html_data = html_response.read()
    return html_data
@contextmanager
def urlopen(*args, **kwargs):
    with closing(_urlopen(*args, **kwargs)) as f:
        yield f
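A usage sketch for the context-manager wrapper above; the URL is a placeholder and the response object is whatever _urlopen returns.

with urlopen("https://example.com/index.html") as resp:   # placeholder URL
    first_kb = resp.read(1024)
# closing() guarantees resp.close() even if the read raises.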
def urlopen(id):
    key = choice(API_KEY)
    url = URL % (id.strip(), key)
    return _urlopen(url, timeout=60).read()
def retrieve_pdb_file(self, pdb_code, obsolete=0, compression=None,
                      uncompress=None, pdir=None):
    """ Retrieves a PDB structure file from the PDB server and
    stores it in a local file tree.
    The PDB structure is returned as a single string.
    If obsolete==1, the file will be saved in a special file tree.
    If uncompress is specified, a system utility will decompress the .gz
    archive. Otherwise, Python gzip utility will handle it.
    compression does nothing, as all archives are already in .gz format

    @param pdir: put the file in this directory
        (default: create a PDB-style directory tree)
    @type pdir: string

    @return: filename
    @rtype: string
    """
    # Alert the user about deprecated parameters
    if compression is not None:
        warnings.warn(
            "PDB file servers now only host .gz archives: "
            "the compression parameter will not do anything",
            BiopythonDeprecationWarning,
        )
    if uncompress is not None:
        warnings.warn(
            "Decompression is handled with the gzip module: "
            "the uncompression parameter will not do anything",
            BiopythonDeprecationWarning,
        )

    # Get the structure
    code = pdb_code.lower()
    filename = "pdb%s.ent.gz" % code
    if not obsolete:
        url = (self.pdb_server +
               "/pub/pdb/data/structures/divided/pdb/%s/pdb%s.ent.gz" %
               (code[1:3], code))
    else:
        url = (self.pdb_server +
               "/pub/pdb/data/structures/obsolete/pdb/%s/pdb%s.ent.gz" %
               (code[1:3], code))

    # In which dir to put the pdb file?
    if pdir is None:
        if self.flat_tree:
            if not obsolete:
                path = self.local_pdb
            else:
                path = self.obsolete_pdb
        else:
            # Put in PDB-style directory tree
            if not obsolete:
                path = os.path.join(self.local_pdb, code[1:3])
            else:
                path = os.path.join(self.obsolete_pdb, code[1:3])
    else:
        # Put in specified directory
        path = pdir
    if not os.access(path, os.F_OK):
        os.makedirs(path)

    filename = os.path.join(path, filename)
    # the final uncompressed file
    final_file = os.path.join(path, "pdb%s.ent" % code)

    # Skip download if the file already exists
    if not self.overwrite:
        if os.path.exists(final_file):
            print "Structure exists: '%s' " % final_file
            return final_file

    # Retrieve the file
    print "Downloading PDB structure '%s'..." % pdb_code
    lines = _urlopen(url).read()
    open(filename, "wb").write(lines)

    # Uncompress the file
    gz = gzip.open(filename, "rb")
    out = open(final_file, "wb")
    out.writelines(gz.read())
    gz.close()
    out.close()
    os.remove(filename)
    return final_file
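A hedged usage sketch, assuming the method above lives on Biopython's PDBList class (as it does in Bio.PDB) and that the configured PDB mirror is reachable; "1fat" and the target directory are only examples.

from Bio.PDB import PDBList

pdb_list = PDBList()
# Downloads pdb1fat.ent.gz, decompresses it, and returns the local path
# to the uncompressed pdb1fat.ent file.
local_path = pdb_list.retrieve_pdb_file("1fat", pdir="./pdb_files")
print(local_path)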