Beispiel #1
0
def fetch_query(state, query):
    """Fetch one query for *state*, dispatching on ``query.type``.

    Unknown types fall back to returning the raw URL. Any failure is
    logged (with traceback) and re-raised.
    """
    # TODO: make a better mapping here
    structured_types = ('arcgis', 'json', 'ckan', 'soda')
    tabular_types = ('pandas', 'xls', 'xlsx')
    try:
        kind = query.type
        if kind in structured_types:
            result = request_and_parse(query.url, query.params)
        elif kind == 'csv':
            result = request_csv(query.url, query.params,
                                 header=query.header,
                                 encoding=query.encoding)
        elif kind == 'html':
            result = request(query.url, query.params, query.encoding)
        elif kind == 'html:soup':
            result = request_soup(query.url, query.params, query.encoding)
        elif kind in tabular_types:
            result = request_pandas(query)
        else:
            # the default is to send the URL as is
            # TODO: It's used for something, but it's not great
            result = query.url
    except Exception:
        logging.error("{}: Failed to fetch {}".format(state, query.url),
                      exc_info=True)
        raise

    return result
Beispiel #2
0
def main(cfg):
    """Backfill RI data: pull the single 'pandas' query, sanity-check the
    date range, shift the dates, and push the edited rows to the API.

    Exits with status 1 when zero or more than one 'pandas' query is
    configured. Raises AssertionError when the fetched dates do not end
    on yesterday / the configured day of week.
    """
    print(cfg.pretty(resolve=True))
    sources = build_sources(cfg.dataset.sources_file, cfg.dataset.mapping_file)

    ri_source = sources[RI]
    queries = ri_source.queries
    mapping = ri_source.mapping

    # need to verify the correct day + day of week
    qs = [q for q in queries if q.type == 'pandas']
    if not qs:
        # previously this fell through and crashed with IndexError on qs[0]
        print("No pandas query found for", RI)
        sys.exit(1)
    if len(qs) > 1:
        print("Don't know which query to choose", [q.get('desc') for q in qs])
        sys.exit(1)

    df = request_pandas(qs[0])
    df = df.rename(columns=mapping)
    df['DATE_INDEX'] = pd.to_datetime(df['DATE'])
    df = df.set_index('DATE_INDEX').sort_index()

    # keep only the mapped output columns ('__strptime' is a meta key)
    df = df[[v for k, v in mapping.items() if k != '__strptime']]

    # We need the last C days,
    # and then, we need to match what would fit Sat-Sun, shifted by 1 day
    df = df.tail(cfg.backfill.skip + cfg.backfill.fill)

    # verify that the dates make sense: we're looking at the most recent day
    yesterday = datetime.now().date() - timedelta(days=1)
    assert df.index[-1].date() == yesterday, \
        "Expecting last date to be yesterday, got %r" % df.index[-1].date()
    assert df.index[-1].day_name() == cfg.backfill.DOW, \
        "Expecting backfill day to be " + cfg.backfill.DOW + ", got " + df.index[-1].day_name()

    # Prepare the request
    if 'POSITIVE' not in df.columns:
        df['POSITIVE'] = df['CONFIRMED']
    if 'STATE' not in df.columns:
        df['STATE'] = RI
    # shift the index by the configured number of days, then format
    # (the old chained assignment wrote an intermediate DATE value that
    # was immediately overwritten)
    shifted = df.index.shift(periods=cfg.backfill.shift, freq='d')
    df['DATE'] = shifted.strftime(cfg.output_date_format)
    df['lastUpdateTime'] = datetime.now(tz=timezone.utc).isoformat()

    print(df)
    # rename
    columns_renames = {k: v.value for k, v in Fields.__members__.items()}
    # one last update
    columns_renames['TOTAL'] = 'totalTestsPeopleViral'
    data = df.rename(columns=columns_renames). \
        head(cfg.backfill.fill). \
        dropna(axis=1). \
        to_dict(orient='records')

    request_content = internal_client.build_edit_request(
        data, username=cfg.api.username)
    if cfg.api.url:
        internal_client.api_call(request_content,
                                 url=cfg.api.url,
                                 token=cfg.creds.token,
                                 staging=cfg.api.staging)
Beispiel #3
0
    def fetch_state(self, state):
        ''' Fetch data for a single state, returning a tuple of
        (fetched_result, parsed_data)

        If there's no query for the state: return (None, _)

        Any fetch failure is logged with traceback and re-raised.
        '''
        logging.debug("Fetching: %s", state)

        queries = self.sources.queries_for(state)
        if not queries:
            return None, {}

        results = []
        mapping = self.sources.mapping_for(state)
        for query in queries:
            # TODO: make a better mapping here
            # Reset per iteration: previously `res` carried over from the
            # prior query, so an unrecognized query type appended a stale
            # duplicate of the previous result instead of None.
            res = None
            try:
                if query['type'] in ['arcgis', 'json', 'ckan', 'soda']:
                    res = request_and_parse(query['url'], query['params'])
                elif query['type'] in ['csv']:
                    res = request_csv(query['url'],
                                      query['params'],
                                      header=query.get('header', True),
                                      encoding=query.get('encoding'))
                elif query['type'] in ['html']:
                    res = request(query['url'], query['params'])
                elif query['type'] in ['html:soup']:
                    res = request_soup(query['url'], query['params'])
                elif query['type'] in ['pandas', 'xls', 'xlsx']:
                    res = request_pandas(query)
                results.append(res)
            except Exception:
                logging.error("{}: Failed to fetch {}".format(
                    state, query['url']),
                              exc_info=True)
                raise

        processed_results = []
        if state in self.extras:
            # state-specific post-processing hook takes over entirely
            processed_results = self.extras[state](results, mapping)
        else:
            for i, result in enumerate(results):
                if queries[i].get('type') == 'arcgis':
                    partial = extract_arcgis_attributes(result, mapping, state)
                else:
                    # This is a guess; getting an unknown top level object
                    partial = extract_attributes(
                        result, queries[i].get('data_path', []), mapping,
                        state)
                processed_results.append(partial)

        data = self._aggregate_state_results(state, processed_results, mapping)
        return results, data