def test_filepath(self):
    """Check that the path produced for a URL ends with the URL's last segment."""
    site_root = 'http://www.abc.org/'
    leaf = 'xyz'
    local_path = Cache().filepath(site_root + leaf)
    # Expected to look like ./xyz; the path is the assertion message so a
    # failure shows what was actually produced.
    assert local_path.endswith(leaf), local_path
def test_cache(self):
    """Check that retrieving ``self.url`` leaves ``abc.txt`` in the cache directory."""
    cache_dir = os.path.join(self.tmp, 'cache')
    Cache(cache_dir).retrieve(self.url)
    expected_file = os.path.join(cache_dir, 'abc.txt')
    assert os.path.exists(expected_file)
import os import re import BeautifulSoup as bs import json from swiss.cache import Cache cache = os.path.join(os.path.dirname(__file__), 'cache') DATAPATH = os.path.join(os.path.dirname(__file__), 'data') europarl_url = 'http://www.europarl.europa.eu' juri_url = 'http://www.europarl.europa.eu/activities/committees/membersCom.do?body=JURI' itre_url = 'http://www.europarl.europa.eu/activities/committees/membersCom.do?body=ITRE' member_base_url = 'http://www.europarl.europa.eu/members/expert/committees/view.do' retriever = Cache(cache) infopath = os.path.join(cache, 'info.js') # from http://effbot.org/zone/re-sub.htm#unescape-html import re, htmlentitydefs def unescape(text): def fixup(m): text = m.group(0) if text[:2] == "&#": # character reference try: if text[:3] == "&#x": return unichr(int(text[3:-1], 16)) else:
'''Extract COFOG codes into usable (csv) form. TODO: do other languages (French, Spanish, Russian) TODO: footnotes about CS and IS ... ''' import csv import os import zipfile import commands from StringIO import StringIO from swiss.cache import Cache cache_path = os.path.join(os.path.dirname(__file__), 'cache') cache = Cache(cache_path) access_db_zip_url = 'http://unstats.un.org/unsd/cr/registry/regdntransfer.asp?f=186' details_table_name = 'tblTitles_English_COFOG' db_filename = 'COFOG_english.mdb' db_filepath = cache.cache_path(db_filename) def retrieve(): '''Retrieve remote files into local cache. ''' fp = cache.retrieve(access_db_zip_url) zipfo = zipfile.ZipFile(fp) # extract is in 2.6 # zipfo.extract('COFOG_english.mdb', cache.path) out = zipfo.read(db_filename)