Example #1
# Assumed imports (not shown in the original snippet); `it.ifilter`
# implies Python 2 (itertools.ifilter):
import itertools as it
import requests

from os.path import splitext
from tempfile import SpooledTemporaryFile

from ckanapi import NotAuthorized
from ckanutils import CKAN
from tabutils import io, convert as cv


def fetch_data(config):
    """Fetches realtime data and generates records"""
    ckan = CKAN(config['ENDPOINT'], apikey=config['API_KEY'])
    # r = ckan.fetch_resource(config['RID'])  # if using ckanutils
    resource = ckan.action.resource_show(id=config['RID'])
    url = resource.get('perma_link') or resource.get('url')
    r = requests.get(url, stream=True)

    if any('403' in h.headers.get('x-ckan-error', '') for h in r.history):
        raise NotAuthorized(
            'Access to fetch resource %s was denied.' % config['RID'])

    try:
        ext = splitext(url)[1].split('.')[1]
    except IndexError:
        ext = cv.ctype2ext(r.headers['Content-Type'])

    if ext == 'csv':
        records = io.read_csv(r.raw, sanitize=True, encoding=r.encoding)
    elif ext in {'xls', 'xlsx'}:
        r = requests.get(url)
        f = SpooledTemporaryFile()
        f.write(r.content)
        records = io.read_xls(f, sanitize=True, encoding=r.encoding)
    else:
        msg = 'Filetype `%s` unsupported. ' % ext
        msg += 'Please view the tabutils.io documentation for assistance.'
        raise TypeError(msg)

    constraints = [('adm0_name', 'a'), ('mp_month', '3'), ('mp_year', '2015')]

    filterer = lambda x: all(x[k].lower().startswith(v) for k, v in constraints)
    return it.ifilter(filterer, records)
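
For context, a minimal sketch of driving `fetch_data` (assumed usage; the
config values below are hypothetical placeholders, while the keys come from
the function body):

config = {
    'ENDPOINT': 'http://demo.ckan.org',  # hypothetical CKAN instance
    'API_KEY': 'my-api-key',             # hypothetical API key
    'RID': 'my-resource-id',             # hypothetical resource ID
}

# `fetch_data` returns a lazy iterator of record dicts
for record in fetch_data(config):
    print(record)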
Example #3
def gen_data(location=None, **kwargs):
    """Fetches realtime data and generates records"""
    url = '%s/%s' % (kwargs['BASE_URL'], location)
    r = requests.get(url)
    f = SpooledTemporaryFile()  # wrap to access `fileno`
    f.write(r.content)
    return io.read_xls(f, sanitize=True, encoding=r.encoding)
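
A hedged usage sketch for this variant; both the file name and `BASE_URL`
are invented placeholders:

records = gen_data('data.xlsx', BASE_URL='http://example.com/files')
print(next(records))  # records is a lazy iterator of row dicts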
Example #4
    # Assumed context: a method on a test class where `nt` is `nose.tools`,
    # `p` is `os.path`, and `io` is `tabutils.io`.
    def test_xls(self):
        filepath = p.join(io.DATA_DIR, "test.xlsx")
        records = io.read_xls(filepath, sanitize=True, sheet=0)
        nt.assert_equal(self.sheet0, next(records))

        with open(filepath, "r+b") as f:
            records = io.read_xls(f, sanitize=True, sheet=0)
            nt.assert_equal(self.sheet0, next(records))

        records = io.read_xls(filepath, sanitize=True, sheet=1)
        nt.assert_equal(self.sheet1, next(records))

        kwargs = {"first_row": 1, "first_col": 1}
        records = io.read_xls(filepath, sanitize=True, sheet=2, **kwargs)
        nt.assert_equal(self.sheet0, next(records))

        records = io.read_xls(filepath, sanitize=True, sheet=3, **kwargs)
        nt.assert_equal(self.sheet1, next(records))
Example #5
# Assumed imports in addition to those in Example #1 (not shown in the
# original snippet):
from datetime import datetime as dt

from bs4 import BeautifulSoup
from dateutil.parser import parse


def gen_data(**kwargs):
    """Fetches realtime data and generates records"""
    url = kwargs['BASE_URL']

    # find the newest year
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    links = [link.get('href') for link in soup.find_all('a')]
    year = sorted(filter(lambda l: l.startswith('2'), links))[-1]
    url += year

    # find the newest month
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    links = [link.get('href') for link in soup.find_all('a')]
    month = sorted(filter(lambda l: l[0] in {'0', '1'}, links))[-1]
    url += month

    # find the newest file
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    links = [link.get('href') for link in soup.find_all('a')]
    name = 'acled-all-africa-file'
    filterer = lambda l: l.lower().startswith(name) and 'xls' in l
    files = sorted(filter(filterer, links))

    if files:
        ftups = [(f.split('.')[0].split('-')[5], f) for f in files]
        filename = sorted(ftups)[-1][1]
        url += filename

        # download the file
        r = requests.get(url)
        f = SpooledTemporaryFile()  # wrap to access `fileno`
        f.write(r.content)

        records = io.read_xls(f, sanitize=True, encoding=r.encoding)
        year = dt.now().year
        filtered = it.ifilter(lambda row: int(float(row['year'])) == year,
                              records)

        for record in filtered:
            month = parse(record['event_date']).month
            month = '0%s' % month if month < 10 else month
            record['year_month'] = '%s%s' % (year, month)
            yield record
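
Since this `gen_data` is a generator, nothing is downloaded until it is
iterated. A consumption sketch, with an invented `BASE_URL` placeholder:

records = gen_data(BASE_URL='http://example.com/acled/')

for record in it.islice(records, 5):  # peek at the first five records
    print(record['year_month'], record['event_date'])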