def fetch_data(config):
    """Fetch realtime data from a CKAN resource and generate records.

    Args:
        config (dict): must contain 'ENDPOINT', 'API_KEY', and 'RID'.

    Returns:
        iterator of record dicts matching the hard-coded constraints.

    Raises:
        NotAuthorized: if access to the resource is denied (HTTP 403).
        TypeError: if the resource filetype is unsupported.
    """
    ckan = CKAN(config['ENDPOINT'], apikey=config['API_KEY'])
    # r = ckan.fetch_resource(config['RID'])  # if using ckanutils
    resource = ckan.action.resource_show(id=config['RID'])
    url = resource.get('perma_link') or resource.get('url')
    r = requests.get(url, stream=True)

    # CKAN reports auth failures via an x-ckan-error header on the
    # redirect history, not on the final response
    if any('403' in h.headers.get('x-ckan-error', '') for h in r.history):
        raise NotAuthorized(
            'Access to fetch resource %s was denied.' % config['RID'])

    try:
        ext = splitext(url)[1].split('.')[1]
    except IndexError:
        # URL has no extension; infer it from the Content-Type header
        ext = cv.ctype2ext(r.headers['Content-Type'])

    if ext == 'csv':
        records = io.read_csv(r.raw, sanitize=True, encoding=r.encoding)
    elif ext in {'xls', 'xlsx'}:
        # re-fetch without streaming and spool to a temp file so the
        # xls reader gets a real file object
        r = requests.get(url)
        f = SpooledTemporaryFile()
        f.write(r.content)
        records = io.read_xls(f, sanitize=True, encoding=r.encoding)
    else:
        # BUG FIX: the original raised the message with a bare, never
        # interpolated `%s` and no space between the two sentences
        msg = 'Filetype `%s` unsupported. ' % ext
        msg += 'Please view tabutils.io documentation for assistance.'
        raise TypeError(msg)

    constraints = [('adm0_name', 'a'), ('mp_month', '3'), ('mp_year', '2015')]
    filterer = lambda x: all(x[k].lower().startswith(v) for k, v in constraints)
    return it.ifilter(filterer, records)
def fetch_data(config):
    """Fetch realtime data from a CKAN resource and generate records.

    Args:
        config (dict): must contain 'ENDPOINT', 'API_KEY', and 'RID'.

    Returns:
        iterator of record dicts matching the hard-coded constraints.

    Raises:
        NotAuthorized: if access to the resource is denied (HTTP 403).
        TypeError: if the resource filetype is unsupported.
    """
    ckan = CKAN(config['ENDPOINT'], apikey=config['API_KEY'])
    # r = ckan.fetch_resource(config['RID'])  # if using ckanutils
    resource = ckan.action.resource_show(id=config['RID'])
    url = resource.get('perma_link') or resource.get('url')
    r = requests.get(url, stream=True)

    # a 403 from CKAN surfaces as an x-ckan-error header somewhere in
    # the redirect history
    if any('403' in h.headers.get('x-ckan-error', '') for h in r.history):
        raise NotAuthorized(
            'Access to fetch resource %s was denied.' % config['RID'])

    try:
        ext = splitext(url)[1].split('.')[1]
    except IndexError:
        # no extension in the URL; fall back to the Content-Type header
        ext = cv.ctype2ext(r.headers['Content-Type'])

    if ext == 'csv':
        records = io.read_csv(r.raw, sanitize=True, encoding=r.encoding)
    elif ext in {'xls', 'xlsx'}:
        # the xls reader needs a real file object, so spool the content
        r = requests.get(url)
        f = SpooledTemporaryFile()
        f.write(r.content)
        records = io.read_xls(f, sanitize=True, encoding=r.encoding)
    else:
        # BUG FIX: interpolate the extension into the message (the `%s`
        # was never filled in) and add the missing sentence separator
        msg = 'Filetype `%s` unsupported. ' % ext
        msg += 'Please view tabutils.io documentation for assistance.'
        raise TypeError(msg)

    constraints = [('adm0_name', 'a'), ('mp_month', '3'), ('mp_year', '2015')]
    filterer = lambda x: all(x[k].lower().startswith(v) for k, v in constraints)
    return it.ifilter(filterer, records)
def gen_data(location=None, **kwargs):
    """Fetch realtime data and generate records.

    Args:
        location (str): path component appended to kwargs['BASE_URL'].
        **kwargs: must contain 'BASE_URL'.

    Returns:
        iterator of records parsed from the downloaded xls(x) file.
    """
    url = '%s/%s' % (kwargs['BASE_URL'], location)
    r = requests.get(url)

    f = SpooledTemporaryFile()  # wrap to access `fileno`
    f.write(r.content)

    # BUG FIX: the original passed the syntactically invalid `r.`;
    # the spooled file `f` is what read_xls must consume
    return io.read_xls(f, sanitize=True, encoding=r.encoding)
def test_xls(self):
    """Check read_xls against every sheet of the fixture workbook."""
    filepath = p.join(io.DATA_DIR, "test.xlsx")

    # sheet 0: read by path, then by an already-open file handle
    reader = io.read_xls(filepath, sanitize=True, sheet=0)
    nt.assert_equal(self.sheet0, next(reader))

    with open(filepath, "r+b") as workbook:
        reader = io.read_xls(workbook, sanitize=True, sheet=0)
        nt.assert_equal(self.sheet0, next(reader))

    # sheet 1: read by path
    reader = io.read_xls(filepath, sanitize=True, sheet=1)
    nt.assert_equal(self.sheet1, next(reader))

    # sheets 2 and 3 mirror sheets 0 and 1, offset by one row/column
    offset = {"first_row": 1, "first_col": 1}
    reader = io.read_xls(filepath, sanitize=True, sheet=2, **offset)
    nt.assert_equal(self.sheet0, next(reader))

    reader = io.read_xls(filepath, sanitize=True, sheet=3, **offset)
    nt.assert_equal(self.sheet1, next(reader))
def _get_links(url):
    """Return the href of every anchor on the HTML page at `url`."""
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    return [link.get('href') for link in soup.find_all('a')]


def gen_data(**kwargs):
    """Fetch realtime data and generate records.

    Walks the directory listing at kwargs['BASE_URL'] to locate the
    newest ACLED all-Africa xls(x) file, downloads it, and yields the
    current year's records augmented with a `year_month` key.
    Yields nothing when no matching file is found.

    Args:
        **kwargs: must contain 'BASE_URL'.

    Yields:
        dict: one record per row of the downloaded file.
    """
    url = kwargs['BASE_URL']

    # find the newest year (listing entries such as '2015/')
    year_dir = sorted(l for l in _get_links(url) if l.startswith('2'))[-1]
    url += year_dir

    # find the newest month (entries '01/' .. '12/')
    month_dir = sorted(
        l for l in _get_links(url) if l[0] in {'0', '1'})[-1]
    url += month_dir

    # find the newest matching file
    name = 'acled-all-africa-file'
    files = sorted(
        l for l in _get_links(url)
        if l.lower().startswith(name) and 'xls' in l)

    # FIX: everything below is guarded on `files`, so an empty listing
    # no longer hits an undefined `filename` / downloads the directory
    if files:
        # pick the file with the latest date token (6th hyphenated field)
        ftups = [(f.split('.')[0].split('-')[5], f) for f in files]
        url += sorted(ftups)[-1][1]

        # download the file
        r = requests.get(url)
        f = SpooledTemporaryFile()  # wrap to access `fileno`
        f.write(r.content)
        records = io.read_xls(f, sanitize=True, encoding=r.encoding)

        year = dt.now().year
        filtered = it.ifilter(
            lambda r: int(float(r['year'])) == year, records)

        for record in filtered:
            month = parse(record['event_date']).month
            # zero-pad single-digit months so year_month sorts lexically
            month = '0%s' % month if month < 10 else month
            record['year_month'] = '%s%s' % (year, month)
            yield record