Code Example #1
def download_data(date, base_url, countryiso3s, input_cols, downloader):
    url = base_url % date.strftime('%b%Y')
    countries_index = dict()
    while url:
        r = downloader.download(url)
        json = r.json()
        for result in json['results']:
            countryiso3 = result['iso3']
            if len(countryiso3) != 1:
                continue
            countryiso3 = countryiso3[0]
            if countryiso3 not in countryiso3s:
                continue
            if result['country_level'] != 'Yes':
                continue
            first_val = result[input_cols[0]]
            if not first_val:
                continue
            country_index = countries_index.get(countryiso3, dict())
            individual_or_aggregated = result['individual_aggregated']
            type_of_crisis = result['type_of_crisis']
            ind_agg_type = country_index.get('ind_agg_type', dict())
            dict_of_lists_add(ind_agg_type, individual_or_aggregated, type_of_crisis)
            country_index['ind_agg_type'] = ind_agg_type
            crises_index = country_index.get('crises', dict())
            crisis_index = crises_index.get(type_of_crisis, dict())
            last_updated = result['Last updated']
            for input_col in input_cols:
                crisis_index[input_col] = (result[input_col], last_updated)
            crises_index[type_of_crisis] = crisis_index
            country_index['crises'] = crises_index
            countries_index[countryiso3] = country_index
        url = json['next']
    return countries_index
Code Example #2
File: hdro.py  Project: OCHA-DAP/hdx-scraper-hdro
def get_countriesdata(hdro_url, downloader):
    response = downloader.download(hdro_url)
    countriesdata = dict()
    for row in response.json():
        countryiso = row['country_code']
        dict_of_lists_add(countriesdata, countryiso, row)
    return countriesdata
Code Example #3
def get_countriesdata(download_url, downloader):
    countrynameisomapping = dict()
    countriesdata = dict()
    headers, iterator = downloader.get_tabular_rows(download_url,
                                                    headers=1,
                                                    dict_form=True)
    countries = list()
    for row in iterator:
        countryname = row['country']
        countryiso = countrynameisomapping.get(countryname)
        if countryiso is None:
            countryiso, _ = Country.get_iso3_country_code_fuzzy(
                countryname, exception=ValueError)
            countrynameisomapping[countryname] = countryiso
            countries.append({
                'iso3': countryiso,
                'countryname': Country.get_country_name_from_iso3(countryiso),
                'origname': countryname
            })
        row['iso3'] = countryiso
        dict_of_lists_add(countriesdata, countryiso, row)
    headers.insert(30, 'iso3')
    headers.insert(3, 'end_year')
    headers.insert(3, 'start_year')
    return countries, headers, countriesdata
Code Example #4
 def add_row(row):
     adm, indicators_process = rowparser.do_set_value(row, name)
     if not adm:
         return
     for i, indicatorcol in enumerate(indicatorcols):
         if not indicators_process[i]:
             continue
         filtercol = indicatorcol['filter_col']
         total_cols = indicatorcol.get('total_cols')
         eval_cols = indicatorcol.get('eval_cols')
         append_cols = indicatorcol.get('append_cols', list())
         keep_cols = indicatorcol.get('keep_cols', list())
         for i, valcol in enumerate(indicatorcol['val_cols']):
             valuedict = valuedicts[filtercol][i]
             val = get_rowval(row, valcol)
             if total_cols or eval_cols:
                 dict_of_lists_add(valuedict, adm, val)
             else:
                 curval = valuedict.get(adm)
                 if valcol in append_cols:
                     if curval:
                         val = curval + val
                 elif valcol in keep_cols:
                     if curval:
                         val = curval
                 valuedict[adm] = val
Code Example #5
    def __init__(
        self,
        site_url: str,
        users: Optional[List[User]] = None,
        organizations: Optional[List[Organization]] = None,
    ):
        self.site_url = site_url
        if users is None:  # pragma: no cover
            users = User.get_all_users()
        self.users: Dict[str, User] = dict()
        self.sysadmins = dict()
        for user in users:
            userid = user["id"]
            self.users[userid] = user
            if user["sysadmin"]:
                self.sysadmins[userid] = user

        self.organizations: Dict = dict()
        if organizations is None:  # pragma: no cover
            organizations: List = Organization.get_all_organization_names(
                all_fields=True, include_users=True
            )
        for organization in organizations:
            users_per_capacity = dict()
            for user in organization["users"]:
                dict_of_lists_add(
                    users_per_capacity, user["capacity"], user["id"]
                )
            self.organizations[organization["id"]] = users_per_capacity
Code Example #6
 def add_row(row):
     adm, _ = rowparser.do_set_value(row, name)
     if not adm:
         return
     for indicatorcol in indicatorcols:
         filtercol = indicatorcol['filter_col']
         if filtercol:
             filtercols = filtercol.split(',')
             match = True
             for filterstr in filtercols:
                 filter = filterstr.split('=')
                 if row[filter[0]] != filter[1]:
                     match = False
                     break
             if not match:
                 continue
         total_col = indicatorcol.get('total_col')
         eval_cols = indicatorcol.get('eval_cols')
         append_cols = indicatorcol.get('append_cols', list())
         keep_cols = indicatorcol.get('keep_cols', list())
         for i, valcol in enumerate(indicatorcol['val_cols']):
             valuedict = valuedicts[filtercol][i]
             val = get_rowval(row, valcol)
             if total_col or eval_cols:
                 dict_of_lists_add(valuedict, adm, val)
             else:
                 curval = valuedict.get(adm)
                 if valcol in append_cols:
                     if curval:
                         val = curval + val
                 elif valcol in keep_cols:
                     if curval:
                         val = curval
                 valuedict[adm] = val
Code Example #7
    def read_external_filter(self, datasetinfo):
        # type: (Dict) -> None
        """Read filter list from an external URL pointing to a HXLated file

        Args:
            datasetinfo (Dict): Dictionary of information about dataset

        Returns:
            None
        """
        external_filter = datasetinfo.get('external_filter')
        if not external_filter:
            return
        hxltags = external_filter['hxltags']
        data = hxl.data(external_filter['url'])
        use_hxl = datasetinfo.get('use_hxl', False)
        for row in data:
            for hxltag in data.columns:
                if hxltag.display_tag in hxltags:
                    if use_hxl:
                        header = hxltag.display_tag
                    else:
                        header = hxltag.header
                    dict_of_lists_add(self.filters, header,
                                      row.get('#country+code'))
Code Example #8
 def add_row(row):
     adm, should_process_subset = rowparser.parse(row, name)
     if not adm:
         return
     for i, subset in enumerate(subsets):
         if not should_process_subset[i]:
             continue
         filter = subset['filter']
         input_ignore_vals = subset.get('input_ignore_vals', list())
         input_transforms = subset.get('input_transforms', dict())
         sum_cols = subset.get('sum_cols')
         process_cols = subset.get('process_cols')
         input_append = subset.get('input_append', list())
         input_keep = subset.get('input_keep', list())
         for i, valcol in enumerate(subset['input_cols']):
             valuedict = valuedicts[filter][i]
             val = get_rowval(row, valcol)
             input_transform = input_transforms.get(valcol)
             if input_transform and val not in input_ignore_vals:
                 val = eval(input_transform.replace(valcol, 'val'))
             if sum_cols or process_cols:
                 dict_of_lists_add(valuedict, adm, val)
             else:
                 curval = valuedict.get(adm)
                 if valcol in input_append:
                     if curval:
                         val = curval + val
                 elif valcol in input_keep:
                     if curval:
                         val = curval
                 valuedict[adm] = val
Code Example #9
 def add_row(row, filepath, indicatorsetname):
     row["path"] = filepath
     quickcharts = indicatorsetname.get("quickcharts")
     if quickcharts and row["DatasetCode"] == quickcharts["code"]:
         row["quickcharts"] = quickcharts["indicators"]
     else:
         row["quickcharts"] = None
     dict_of_lists_add(indicatorsets, indicatorsetname["category"], row)
Code Example #10
    def prepare_user_emails(
        hdxhelper: HDXHelper,
        include_datasetdate: bool,
        datasets: List[Dict],
        sheet: Sheet,
        sheetname: str,
    ) -> Dict[str, List]:
        """Prepare emails to users

        Args:
            hdxhelper (HDXHelper): HDX helper object
            include_datasetdate (bool): Whether to include dataset date in output
            datasets (List[Dict]): List of datasets
            sheet (Sheet): Sheet object
            sheetname (str): Name of sheet

        Returns:
            Dict[str, List]: Emails to users
        """

        all_users_to_email = dict()
        datasets_flat = list()
        for dataset in sorted(
            datasets, key=lambda d: (d["organization_title"], d["name"])
        ):
            (
                maintainer,
                orgadmins,
                users_to_email,
            ) = hdxhelper.get_maintainer_orgadmins(dataset)
            (
                dataset_string,
                dataset_html_string,
            ) = hdxhelper.create_dataset_string(
                dataset,
                maintainer,
                orgadmins,
                include_datasetdate=include_datasetdate,
            )
            for user in users_to_email:
                id = user["id"]
                dict_of_lists_add(
                    all_users_to_email,
                    id,
                    (dataset_string, dataset_html_string),
                )
            row = sheet.construct_row(
                hdxhelper, dataset, maintainer, orgadmins
            )
            if include_datasetdate:
                start_date, end_date = hdxhelper.get_dataset_dates(dataset)
                row["Dataset Start Date"] = start_date.isoformat()
                row["Dataset End Date"] = end_date.isoformat()
            datasets_flat.append(row)
        if sheetname is not None:
            sheet.update(sheetname, datasets_flat)
        return all_users_to_email
Code Example #11
 def get_external_filter(self, datasetinfo):
     external_filter = datasetinfo.get('external_filter')
     if not external_filter:
         return
     hxltags = external_filter['hxltags']
     data = hxl.data(external_filter['url'])
     for row in data:
         for hxltag in data.columns:
             if hxltag.display_tag in hxltags:
                 dict_of_lists_add(self.filters, hxltag.header, row.get('#country+code'))
Code Example #12
    def add_data_row(self, key, row):
        # type: (str, Dict) -> None
        """Add row to JSON under a key

        Args:
            key (str): Key in JSON to update
            row (Dict): Row to add

        Returns:
            None
        """
        dict_of_lists_add(self.json, '%s_data' % key, row)
Code Example #13
 def test_dict_of_lists_add(self):
     d = dict()
     dict_of_lists_add(d, 'a', 1)
     assert d == {'a': [1]}
     dict_of_lists_add(d, 2, 'b')
     assert d == {'a': [1], 2: ['b']}
     dict_of_lists_add(d, 'a', 2)
     assert d == {'a': [1, 2], 2: ['b']}
     dict_of_lists_add(d, 2, 'c')
     assert d == {'a': [1, 2], 2: ['b', 'c']}
     dict_of_lists_add(d, 2, 'b')
     assert d == {'a': [1, 2], 2: ['b', 'c', 'b']}
Code Example #14
 def test_dict_of_lists_add(self):
     d = dict()
     dict_of_lists_add(d, "a", 1)
     assert d == {"a": [1]}
     dict_of_lists_add(d, 2, "b")
     assert d == {"a": [1], 2: ["b"]}
     dict_of_lists_add(d, "a", 2)
     assert d == {"a": [1, 2], 2: ["b"]}
     dict_of_lists_add(d, 2, "c")
     assert d == {"a": [1, 2], 2: ["b", "c"]}
     dict_of_lists_add(d, 2, "b")
     assert d == {"a": [1, 2], 2: ["b", "c", "b"]}
Code Example #15
def get_ipc(configuration, admininfo, downloader, scrapers=None):
    name = inspect.currentframe().f_code.co_name
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list()
    ipc_configuration = configuration['ipc']
    url = ipc_configuration['url']
    phasedict = dict()
    popdict = dict()
    for countryiso3 in admininfo.countryiso3s:
        countryiso2 = Country.get_iso2_from_iso3(countryiso3)
        data, adm1_names = get_data(downloader, url, countryiso2)
        if not data:
            continue
        for row in data:
            country = row['Country']
            if adm1_names:
                if country not in adm1_names:
                    continue
                adm1_name = country
            else:
                adm1_name = row['Area']
                if not adm1_name or adm1_name == country:
                    continue
            pcode, _ = admininfo.get_pcode(countryiso3, adm1_name, 'IPC')
            if not pcode:
                continue
            population = row['Current Phase P3+ #']
            if population:
                dict_of_lists_add(popdict, pcode, population)
            percentage = row['Current Phase P3+ %']
            if percentage:
                dict_of_lists_add(phasedict, pcode, percentage)
    for pcode in phasedict:
        percentages = phasedict[pcode]
        if len(percentages) == 1:
            phasedict[pcode] = get_fraction_str(percentages[0])
        else:
            populations = popdict[pcode]
            numerator = 0
            denominator = 0
            for i, percentage in enumerate(percentages):
                population = populations[i]
                numerator += population * percentage
                denominator += population
            phasedict[pcode] = get_fraction_str(numerator, denominator)
    logger.info('Processed IPC')
    dataset = Dataset.read_from_hdx(ipc_configuration['dataset'])
    date = get_date_from_dataset_date(dataset)
    hxltag = '#affected+food+ipc+p3+pct'
    return [['FoodInsecurityIPCP3+'], [hxltag]], [phasedict], \
           [(hxltag, date, dataset['dataset_source'], dataset.get_hdx_url())]
Code Example #16
def get_countriesdata(base_url, downloader, indicators):
    def download(alias, subalias):
        url = f"{base_url}{alias}/{subalias}"
        downloader.download(url)
        json = downloader.get_json()

        return url, json["data"]

    countriesdata = dict()

    for alias in indicators:
        indicators_alias = indicators[alias]
        for subalias in indicators_alias.get("country", list()):
            url, data = download(alias, subalias)
            iso3s = set()
            for info in data:
                iso3 = info["iso3"]
                if iso3 in iso3s:
                    continue
                iso3s.add(iso3)
                countrydata = countriesdata.get(iso3, dict())
                countryalias = countrydata.get(alias, dict())
                dict_of_lists_add(countryalias, subalias, f"{url}?iso3={iso3}")
                countrydata[alias] = countryalias
                countriesdata[iso3] = countrydata
        subalias = indicators_alias.get("global")
        if subalias:
            url, data = download(alias, subalias)
            countrydata = countriesdata.get("World", dict())
            countryalias = countrydata.get(alias, dict())
            countryalias[subalias] = [f"{url}?id={x['id']}" for x in data]
            countrydata[alias] = countryalias
            countriesdata["World"] = countrydata

    countries = [{"iso3": x} for x in sorted(countriesdata.keys()) if x != "World"]
    countries.append({"iso3": "World"})
    return countriesdata, countries
Code Example #17
def get_covax_deliveries(configuration, today, countryiso3s, downloader, scrapers=None):
    name = 'covax_deliveries'
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list()
    datasetinfo = configuration[name]
    headers, iterator = read(downloader, datasetinfo, today=today)
    hxlrow = next(iterator)
    doses_lookup = dict()
    for row in iterator:
        newrow = dict()
        for key in row:
            newrow[hxlrow[key]] = row[key]
        countryiso = newrow['#country+code']
        if not countryiso or countryiso not in countryiso3s:
            continue
        key = f'{countryiso}|{newrow["#meta+vaccine+pipeline"]}|{newrow["#meta+vaccine+producer"]}|{newrow["#meta+vaccine+funder"]}'
        nodoses = get_numeric_if_possible(newrow['#capacity+vaccine+doses'])
        if nodoses:
            doses_lookup[key] = doses_lookup.get(key, 0) + nodoses
    pipelines = dict()
    producers = dict()
    funders = dict()
    doses = dict()
    for key in sorted(doses_lookup):
        countryiso, pipeline, producer, funder = key.split('|')
        dict_of_lists_add(pipelines, countryiso, pipeline)
        dict_of_lists_add(producers, countryiso, producer)
        dict_of_lists_add(funders, countryiso, funder)
        dict_of_lists_add(doses, countryiso, str(doses_lookup[key]))
    for countryiso in pipelines:
        pipelines[countryiso] = '|'.join(pipelines[countryiso])
        producers[countryiso] = '|'.join(producers[countryiso])
        funders[countryiso] = '|'.join(funders[countryiso])
        doses[countryiso] = '|'.join(doses[countryiso])
    logger.info('Processed covax deliveries')
    hxltags = ['#meta+vaccine+pipeline', '#meta+vaccine+producer', '#meta+vaccine+funder', '#capacity+vaccine+doses']
    return [['Pipeline', 'Vaccine', 'Funder', 'Doses'], hxltags], \
           [pipelines, producers, funders, doses], [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url']) for hxltag in hxltags]
Code Example #18
    def generate_dataset_and_showcase(self, countryiso3, folder):
        countryname = Country.get_country_name_from_iso3(countryiso3)
        title = f'{countryname} - Food Prices'
        logger.info(f'Creating dataset: {title}')
        name = f'WFP food prices for {countryname}'
        slugified_name = slugify(name).lower()

        dataset = Dataset({
            'name': slugified_name,
            'title': title,
        })
        dataset.set_maintainer('f1921552-8c3e-47e9-9804-579b14a83ee3')
        dataset.set_organization('3ecac442-7fed-448d-8f78-b385ef6f84e7')

        dataset.set_expected_update_frequency('weekly')
        dataset.add_country_location(countryname)
        dataset.set_subnational(True)
        tags = ['commodities', 'prices', 'markets', 'hxl']
        dataset.add_tags(tags)

        prices_data = self.get_list('MarketPrices/PriceMonthly', countryiso3)
        if not prices_data:
            logger.info(f'{countryiso3} has no prices data!')
            return None, None, None
        market_to_adm = dict()
        for market in self.get_list('Markets/List', countryiso3):
            market_to_adm[market['marketId']] = market['admin1Name'], market['admin2Name'], market['marketLatitude'],\
                                                market['marketLongitude']

        rows = dict()
        sources = dict()
        markets = dict()
        for price_data in prices_data:
            if price_data['commodityPriceFlag'] not in ('actual', 'aggregate'):
                continue
            date = price_data['commodityPriceDate']
            category = self.commodity_to_category[price_data['commodityID']]
            market = price_data['marketName']
            if market == 'National Average':
                adm1 = adm2 = lat = lon = ''
            else:
                market_id = price_data['marketID']
                if market_id in market_to_adm:
                    adm1, adm2, lat, lon = market_to_adm[market_id]
                else:
                    adm1 = adm2 = lat = lon = ''
            orig_source = price_data['commodityPriceSourceName'].replace(
                'M/o', 'Ministry of').replace('+', '/')
            regex = r'Government.*,(Ministry.*)'
            match = re.search(regex, orig_source)
            if match:
                split_sources = [match.group(1)]
            else:
                split_sources = orig_source.replace(',', '/').replace(
                    ';', '/').split('/')
            for source in split_sources:
                source = source.strip()
                if not source:
                    continue
                if source[-1] == '.':
                    source = source[:-1]
                source_lower = source.lower()
                if 'mvam' in source_lower and len(source_lower) <= 8:
                    source = 'WFP mVAM'
                elif '?stica' in source:
                    source = source.replace('?stica', 'ística')
                source_lower = source.lower()
                if not self.match_source(sources.keys(), source_lower):
                    sources[source_lower] = source
            commodity = price_data['commodityName']
            unit = price_data['commodityUnitName']
            price = price_data['commodityPrice']
            currency = price_data['currencyName']
            pricetype = price_data['commodityPriceFlag']
            key = date, adm1, adm2, market, category, commodity, unit
            rows[key] = {
                'date': date,
                'adm1name': adm1,
                'adm2name': adm2,
                'market': market,
                'latitude': lat,
                'longitude': lon,
                'category': category,
                'commodity': commodity,
                'unit': unit,
                'currency': currency,
                'pricetype': pricetype,
                'price': price
            }
            if adm1 and adm2 and category:
                adm1adm2market = adm1, adm2, market
                commodities = markets.get(adm1adm2market, dict())
                dict_of_lists_add(commodities, (commodity, unit, currency),
                                  (date, price))
                markets[adm1adm2market] = commodities
        if not rows:
            logger.info(f'{countryiso3} has no prices!')
            return None, None, None
        number_market = list()
        for key, commodities in markets.items():
            number_market.append((len(commodities), key))
        number_market = sorted(number_market, reverse=True)
        qc_indicators = list()
        qc_rows = [qc_hxltags]
        chosen_commodities = set()
        # Go through markets starting with the one with most commodities
        for _, adm1adm2market in number_market:
            commodities = markets[adm1adm2market]
            number_commodity = list()
            for commodityunitcurrency, details in commodities.items():
                number_commodity.append((len(details), commodityunitcurrency))
            number_commodity = sorted(number_commodity, reverse=True)
            index = 0
            # Pick commodity with most rows that has not already been used for another market
            commodity, unit, currency = number_commodity[index][1]
            while commodity in chosen_commodities:
                index += 1
                if index == len(number_commodity):
                    commodity, unit, currency = number_commodity[0][1]
                    break
                commodity, unit, currency = number_commodity[index][1]
            adm1, adm2, market = adm1adm2market
            code = f'{adm1}-{adm2}-{market}-{commodity}-{unit}-{currency}'
            for date, price in sorted(commodities[(commodity, unit,
                                                   currency)]):
                qc_rows.append({'date': date, 'code': code, 'price': price})
            chosen_commodities.add(commodity)
            marketname = market
            if adm2 != market:
                marketname = f'{adm2}/{marketname}'
            if adm1 != adm2:
                marketname = f'{adm1}/{marketname}'
            qc_indicators.append({
                'code': code,
                'title': f'Price of {commodity} in {market}',
                'unit': f'Currency {currency}',
                'description': f'Price of {commodity} ({currency}/{unit}) in {marketname}',
                'code_col': '#meta+code',
                'value_col': '#value',
                'date_col': '#date'
            })
            if len(qc_indicators) == 3:
                break
        dataset['dataset_source'] = ', '.join(sorted(sources.values()))
        filename = f'wfp_food_prices_{countryiso3.lower()}.csv'
        resourcedata = {
            'name': title,
            'description': 'Food prices data with HXL tags',
            'format': 'csv'
        }
        rows = [rows[key] for key in sorted(rows)]
        dataset.generate_resource_from_iterator(headers,
                                                rows,
                                                hxltags,
                                                folder,
                                                filename,
                                                resourcedata,
                                                datecol='date')
        filename = f'wfp_food_prices_{countryiso3.lower()}_qc.csv'
        resourcedata = {
            'name': f'QuickCharts: {title}',
            'description': 'Food prices QuickCharts data with HXL tags',
            'format': 'csv'
        }
        dataset.generate_resource_from_rows(folder,
                                            filename,
                                            qc_rows,
                                            resourcedata,
                                            headers=list(qc_hxltags.keys()))
        showcase = Showcase({
            'name': f'{slugified_name}-showcase',
            'title': f'{title} showcase',
            'notes': f'{countryname} food prices data from World Food Programme displayed through VAM Economic Explorer',
            'url': f'http://dataviz.vam.wfp.org/economic_explorer/prices?iso3={countryiso3}',
            'image_url': 'http://dataviz.vam.wfp.org/_images/home/3_economic.jpg'
        })
        showcase.add_tags(tags)
        return dataset, showcase, qc_indicators
Code Example #19
 def add_data_row(self, name, row):
     dict_of_lists_add(self.json, '%s_data' % name, row)
Code Example #20
def get_iom_dtm(configuration, today_str, adminone, downloader, scrapers=None):
    name = inspect.currentframe().f_code.co_name
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list()
    iom_url = configuration['iom_url']
    headers, iterator = downloader.get_tabular_rows(iom_url,
                                                    headers=1,
                                                    dict_form=True,
                                                    format='csv')
    rows = list(iterator)
    idpsdict = dict()
    for ds_row in rows:
        countryiso3 = ds_row['Country ISO']
        dataset = Dataset.read_from_hdx(ds_row['Dataset Name'])
        if not dataset:
            logger.warning('No IOM DTM data for %s.' % countryiso3)
            continue
        url = dataset.get_resource()['url']
        try:
            data = hxl.data(url).cache()
            data.display_tags
        except hxl.HXLException:
            logger.warning(
                'Could not process IOM DTM data for %s. Maybe there are no HXL tags.'
                % countryiso3)
            continue
        pcodes_found = False
        for row in data:
            pcode = row.get('#adm1+code')
            if pcode:
                pcode = adminone.convert_pcode_length(countryiso3, pcode,
                                                      'iom_dtm')
            else:
                adm2code = row.get('#adm2+code')
                if adm2code:
                    if len(adm2code) > 4:
                        pcode = adm2code[:-2]
                    else:  # incorrectly labelled adm2 code
                        pcode = adm2code
            if not pcode:
                adm1name = row.get('#adm1+name')
                if adm1name:
                    pcode, _ = adminone.get_pcode(countryiso3, adm1name,
                                                  'iom_dtm')
            if not pcode:
                location = row.get('#loc')
                if location:
                    location = location.split('>')[-1]
                    pcode, _ = adminone.get_pcode(countryiso3, location,
                                                  'iom_dtm')
            if pcode:
                pcodes_found = True  # at least one pcode found for this country
                pcode = pcode.strip().upper()
                idps = row.get('#affected+idps+ind')
                if idps:
                    dict_of_lists_add(idpsdict, '%s:%s' % (countryiso3, pcode),
                                      idps)
        if not pcodes_found:
            logger.warning('No pcodes found for %s.' % countryiso3)

    idps = dict()
    for countrypcode in idpsdict:
        countryiso3, pcode = countrypcode.split(':')
        if pcode not in adminone.pcodes:
            logger.error('PCode %s in %s does not exist!' %
                         (pcode, countryiso3))
        else:
            idps[pcode] = sum(idpsdict[countrypcode])
    logger.info('Processed IOM DTMs')
    return [['IDPs'], ['#affected+idps+ind']
            ], [idps], [('#affected+idps+ind', today_str, 'IOM', iom_url)]
Code Example #21
def main(file_path, hdx_key, user_agent, preprefix, hdx_site, db_url, db_params, gsheet_auth):
    if db_params:
        params = args_to_dict(db_params)
    elif db_url:
        params = Database.get_params_from_sqlalchemy_url(db_url)
    else:
        params = {'driver': 'sqlite', 'database': 'freshness.db'}
    logger.info('> Database parameters: %s' % params)
    with Database(**params) as session:
        info = json.loads(gsheet_auth)
        scopes = ['https://www.googleapis.com/auth/spreadsheets', 'https://www.googleapis.com/auth/drive']
        credentials = service_account.Credentials.from_service_account_info(info, scopes=scopes)
        gc = pygsheets.authorize(custom_credentials=credentials)
        configuration = load_yaml('project_configuration.yml')
        spreadsheet = gc.open_by_url(configuration['spreadsheet_url'])
        sheet = spreadsheet.worksheet_by_title('datasets')
        sheet.clear()
        rows = [['update freq', 'fresh', 'no days', 'title', 'run date', 'last modified', 'dataset date', 'dataset end date', 'org title', 'URL', 'id', 'org id', 'maintainer', 'what updated', 'resources']]
        run_number, run_date = session.query(DBRun.run_number, DBRun.run_date).order_by(DBRun.run_number.desc()).first()
        logger.info('Run number is %d' % run_number)

        datasetcolumns = [DBDataset.update_frequency, DBDataset.fresh, DBInfoDataset.title, DBDataset.last_modified,
                          DBDataset.dataset_date, DBOrganization.title.label('organization_title'), DBInfoDataset.name,
                          DBDataset.id, DBOrganization.id.label('organization_id'), DBInfoDataset.maintainer, DBDataset.what_updated]

        resourcecolumns = [DBDataset.id, DBResource.url]

        def get_datasets(update_frequency, fresh):
            filters = [DBDataset.run_number == run_number, DBDataset.id == DBInfoDataset.id,
                       DBInfoDataset.organization_id == DBOrganization.id,
                       DBDataset.fresh == fresh, DBDataset.update_frequency == update_frequency]
            return session.query(*datasetcolumns).filter(and_(*filters))

        def get_resources(dataset_ids):
            filters = [DBDataset.run_number == run_number, DBResource.run_number == run_number,
                       DBDataset.id == DBResource.dataset_id, DBDataset.id.in_(dataset_ids)]
            return session.query(*resourcecolumns).filter(and_(*filters))

        fresh_values = [0, 1, 2, 3]
        update_frequencies = [1, 7, 14, 30, 180, 365]

        repobase = '%s/tree/master/datasets/' % configuration['repo']
        dir = join(file_path, 'datasets')
        rmtree(dir, ignore_errors=True)
        mkdir(dir)

        with Download(user_agent=user_agent, preprefix=preprefix) as downloader:
            status_forcelist = [429, 500, 502, 503, 504]
            method_whitelist = frozenset(['HEAD', 'TRACE', 'GET', 'PUT', 'OPTIONS', 'DELETE'])
            retries = Retry(total=1, backoff_factor=0.4, status_forcelist=status_forcelist,
                            method_whitelist=method_whitelist,
                            raise_on_redirect=True,
                            raise_on_status=True)
            downloader.session.mount('http://', HTTPAdapter(max_retries=retries, pool_connections=100, pool_maxsize=100))
            downloader.session.mount('https://', HTTPAdapter(max_retries=retries, pool_connections=100, pool_maxsize=100))

            for update_frequency in update_frequencies:
                for fresh in fresh_values:
                    org_ids = list()
                    results = get_datasets(update_frequency, fresh)
                    datasets = list()
                    ids = list()
                    datasets_urls = dict()
                    for dataset in results:
                        dataset = list(dataset)
                        datasets.append(dataset)
                        ids.append(dataset[7])
                    for result in get_resources(ids):
                        resource = list(result)
                        dict_of_lists_add(datasets_urls, resource[0], resource[1])
                    for dataset in datasets:
                        org_id = dataset[8]
                        if org_id in org_ids:
                            continue
                        dataset = list(dataset)
                        dataset[0] = Dataset.transform_update_frequency(str(update_frequency))
                        fresh = dataset[1]
                        if fresh == 0:
                            dataset[1] = 'fresh'
                        elif fresh == 1:
                            dataset[1] = 'due'
                        elif fresh == 2:
                            dataset[1] = 'overdue'
                        elif fresh == 3:
                            dataset[1] = 'delinquent'
                        last_modified = dataset[3]
                        dataset[3] = last_modified.isoformat()
                        nodays = (run_date - last_modified).days
                        dataset.insert(2, nodays)
                        dataset.insert(4, run_date.isoformat())
                        dataset_date = dataset[6]
                        if '-' in dataset_date:
                            dataset_date = dataset_date.split('-')
                            dataset[6] = datetime.strptime(dataset_date[0], '%m/%d/%Y').date().isoformat()
                            dataset.insert(7, datetime.strptime(dataset_date[1], '%m/%d/%Y').date().isoformat())
                        else:
                            dataset[6] = datetime.strptime(dataset_date, '%m/%d/%Y').date().isoformat()
                            dataset.insert(7, '')
                        dataset_name = dataset[9]
                        dataset[9] = 'https://data.humdata.org/dataset/%s' % dataset_name
                        org_ids.append(org_id)
                        if len(org_ids) == 6:
                            break
                        urls = datasets_urls[dataset[10]]
                        if len(urls) != 0:
                            datasetdir = join(dir, dataset_name)
                            mkdir(datasetdir)
                            for url in urls:
                                urlpath = urlsplit(url).path
                                filename = basename(urlpath)
                                try:
                                    downloader.download_file(url, datasetdir, filename)
                                except DownloadError as ex:
                                    with open(join(datasetdir, filename), 'w') as text_file:
                                        text_file.write(str(ex))
                            dataset.append('%s%s' % (repobase, dataset_name))
                        else:
                            dataset.append('')
                        rows.append(dataset)
                        logger.info('Added dataset %s' % dataset_name)
            sheet.update_values('A1', rows)
Code Example #22
def get_access(configuration, admininfo, downloader, scrapers=None):
    name = inspect.currentframe().f_code.co_name
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list(), list(), list(), list(), list(), \
               list(), list()
    access_configuration = configuration['access_constraints']
    ranking_url = access_configuration['ranking_url']
    headers, rows = read_tabular(downloader, {
        'url': ranking_url,
        'headers': 1,
        'format': 'csv'
    })
    sheets = access_configuration['sheets']
    constraint_rankings = {x: dict() for x in sheets}
    nocountries_per_region = {'global': 0}
    top3counts = {'global': dict()}
    for region in admininfo.regions:
        nocountries_per_region[region] = 0
        top3counts[region] = dict()
    for row in rows:
        countryiso = row['iso3']
        nocountries_per_region['global'] += 1
        for region in admininfo.iso3_to_region_and_hrp.get(countryiso, list()):
            nocountries_per_region[region] += 1
        for sheet in sheets:
            if '%s_1' % sheet not in row:
                continue
            type_ranking = constraint_rankings.get(sheet, dict())
            for i in range(1, 4):
                constraint = row['%s_%d' % (sheet, i)]
                dict_of_lists_add(type_ranking, countryiso, constraint)
            constraint_rankings[sheet] = type_ranking
    data = dict()
    datasetinfo = {
        'dataset': access_configuration['dataset'],
        'headers': 1,
        'format': 'xlsx'
    }
    for sheet, sheetinfo in sheets.items():
        datasetinfo['sheet'] = sheetinfo['sheetname']
        headers, rows = read_hdx(downloader, datasetinfo)
        datasheet = data.get(sheet, dict())
        for row in rows:
            countryiso = Country.get_iso3_country_code(
                row[sheetinfo['isocol']])
            if countryiso not in admininfo.countryiso3s:
                continue
            countrydata = datasheet.get(countryiso, dict())
            score = countrydata.get('score', 0)
            newscore = row[sheetinfo['scorecol']]
            textcol = sheetinfo.get('textcol')
            if textcol:
                text = row[textcol]
                dict_of_lists_add(countrydata, 'text', (newscore, text))
                for region, top3countsregion in top3counts.items():
                    if region != 'global' and region not in admininfo.iso3_to_region_and_hrp.get(
                            countryiso, list()):
                        continue
                    top3countssheet = top3countsregion.get(sheet, dict())
                    if sheet == 'impact':
                        if newscore != 0:
                            top3countssheet[text] = top3countssheet.get(
                                text, 0) + 1
                    else:
                        if newscore == 3:
                            top3countssheet[text] = top3countssheet.get(
                                text, 0) + 1
                    top3countsregion[sheet] = top3countssheet
                weights = sheetinfo.get('weights')
                if weights:
                    weight = weights.get(text)
                    if weight:
                        newscore *= weight
                score += newscore
            else:
                dict_of_lists_add(countrydata, 'text', (newscore, newscore))
                for region, top3countsregion in top3counts.items():
                    if region != 'global' and region not in admininfo.iso3_to_region_and_hrp.get(
                            countryiso, list()):
                        continue
                    top3countssheet = top3countsregion.get(sheet, dict())
                    if newscore == 'yes':
                        top3countssheet[sheet] = top3countssheet.get(sheet,
                                                                     0) + 1
                    top3countsregion[sheet] = top3countssheet
                score = newscore
            countrydata['score'] = score
            datasheet[countryiso] = countrydata
        data[sheet] = datasheet
    gvaluedicts = [dict() for _ in range(7)]
    rvaluedicts = [dict() for _ in range(7)]
    for region, top3countsregion in top3counts.items():
        if region == 'global':
            valuedicts = gvaluedicts
        else:
            valuedicts = rvaluedicts
        for i, (sheet, top3countssheet) in enumerate(top3countsregion.items()):
            sortedcounts = sorted(top3countssheet,
                                  key=top3countssheet.get,
                                  reverse=True)
            texts = list()
            pcts = list()
            for text in sortedcounts[:3]:
                texts.append(text)
                pcts.append(
                    get_fraction_str(top3countssheet[text],
                                     nocountries_per_region[region]))
            if sheet == 'mitigation':
                valuedicts[i * 2][region] = pcts[0]
            else:
                valuedicts[i * 2][region] = '|'.join(texts)
                valuedicts[i * 2 + 1][region] = '|'.join(pcts)
    valuedicts = [dict() for _ in range(6)]
    severityscore = valuedicts[0]
    for i, sheet in enumerate(data):
        datasheet = data[sheet]
        for countryiso in datasheet:
            countrydata = datasheet[countryiso]
            ranked = sorted(countrydata['text'], reverse=True)
            top_value = ranked[0][0]
            texts = list()
            for value, text in countrydata['text']:
                if value == top_value:
                    if sheet == 'mitigation' or text in constraint_rankings[
                            sheet][countryiso]:
                        texts.append(text)
            valuedicts[i + 2][countryiso] = '|'.join(texts)
            if 'constraints' in sheet:
                score = severityscore.get(countryiso, 0)
                score += countrydata['score']
                severityscore[countryiso] = score
    ranges = access_configuration['category']
    severitycategory = valuedicts[1]
    for countryiso in severityscore:
        score = severityscore.get(countryiso)
        if score is None:
            severitycategory[countryiso] = None
            continue
        severitycategory[countryiso] = process_range(ranges, score)
    logger.info('Processed access')
    grheaders = [
        'Access Constraints Into', 'Access Constraints Into Pct',
        'Access Constraints Within', 'Access Constraints Within Pct',
        'Access Impact', 'Access Impact Pct', 'Mitigation Pct'
    ]
    headers = [
        'Access Severity Score', 'Access Severity Category',
        'Access Constraints Into', 'Access Constraints Within',
        'Access Impact', 'Mitigation'
    ]
    grhxltags = [
        '#access+constraints+into+desc', '#access+constraints+into+pct',
        '#access+constraints+within+desc', '#access+constraints+within+pct',
        '#access+impact+desc', '#access+impact+pct', '#access+mitigation+pct'
    ]
    hxltags = [
        '#severity+access+num+score', '#severity+access+category+num',
        '#access+constraints+into+desc', '#access+constraints+within+desc',
        '#access+impact+desc', '#access+mitigation+desc'
    ]
    return [grheaders, grhxltags], gvaluedicts, \
           [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url']) for hxltag in grhxltags], \
           [grheaders, grhxltags], rvaluedicts, \
           [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url']) for hxltag in grhxltags], \
           [headers, hxltags], valuedicts, \
           [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url']) for hxltag in hxltags]
Code Example #23
File: ipc.py  Project: orest-d/hdx-scraper-covid-viz
def get_ipc(configuration, admininfo, downloader, scrapers=None):
    name = inspect.currentframe().f_code.co_name
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list(), list(), list()
    ipc_configuration = configuration['ipc']
    url = ipc_configuration['url']
    phases = ['3', '4', '5', 'P3+']
    national_phases = {phase: dict() for phase in phases}
    national_analysed = dict()
    subnational_phases = {phase: dict() for phase in phases}
    subnational_populations = {phase: dict() for phase in phases}
    for countryiso3 in admininfo.countryiso3s:
        countryiso2 = Country.get_iso2_from_iso3(countryiso3)
        data, adm1_names = get_data(downloader, url, countryiso2)
        if not data:
            continue
        row = data[0]
        for phase in phases:
            national_phases[phase][countryiso3] = row[
                f'Current Phase {phase} %']
        national_analysed[
            countryiso3] = f'{row["Current Population Analysed % of total county Pop"]:.03f}'
        for row in data[1:]:
            country = row['Country']
            if adm1_names:
                if country not in adm1_names:
                    continue
                adm1_name = country
            else:
                adm1_name = row['Area']
                if not adm1_name or adm1_name == country:
                    continue
            pcode, _ = admininfo.get_pcode(countryiso3, adm1_name, 'IPC')
            if not pcode:
                continue
            for phase in phases:
                population = row[f'Current Phase {phase} #']
                if population:
                    dict_of_lists_add(subnational_populations[phase], pcode,
                                      population)
                percentage = row[f'Current Phase {phase} %']
                if percentage:
                    dict_of_lists_add(subnational_phases[phase], pcode,
                                      percentage)
    for phase in phases:
        subnational_phase = subnational_phases[phase]
        for pcode in subnational_phase:
            percentages = subnational_phase[pcode]
            if len(percentages) == 1:
                subnational_phase[pcode] = get_fraction_str(percentages[0])
            else:
                populations = subnational_populations[phase][pcode]
                numerator = 0
                denominator = 0
                for i, percentage in enumerate(percentages):
                    population = populations[i]
                    numerator += population * percentage
                    denominator += population
                subnational_phase[pcode] = get_fraction_str(
                    numerator, denominator)
    logger.info('Processed IPC')
    dataset = Dataset.read_from_hdx(ipc_configuration['dataset'])
    date = get_date_from_dataset_date(dataset)
    headers = [f'FoodInsecurityIPC{phase}' for phase in phases]
    headers.append('FoodInsecurityIPCAnalysed')
    hxltags = [f'#affected+food+ipc+p{phase}+pct' for phase in phases[:-1]]
    hxltags.append('#affected+food+ipc+p3plus+pct')
    hxltags.append('#affected+food+ipc+analysed+pct')
    national_outputs = [national_phases[phase] for phase in phases]
    national_outputs.append(national_analysed)
    subnational_outputs = [subnational_phases[phase] for phase in phases]
    return [headers, hxltags], national_outputs, [headers[:-1], hxltags[:-1]], subnational_outputs, \
           [(hxltag, date, dataset['dataset_source'], dataset.get_hdx_url()) for hxltag in hxltags]
Code Example #24
    def process_datasets_datagrid(
        self,
        recipients: Optional[List[str]] = None,
        datasetclass: Type[Dataset] = Dataset,
    ) -> None:
        """Check for datasets that are candidates for the datagrid.

        Args:
            recipients (Optional[List[str]]): Recipient emails. Defaults to None.
            datasetclass (Type[Dataset]): Class with search_in_hdx. Defaults to Dataset.

        Returns:
            None
        """

        logger.info(
            "\n\n*** Checking for datasets that are candidates for the datagrid ***"
        )
        nodatasetsmsg = "No dataset candidates for the data grid {} found."
        startmsg = "Dear {},\n\nThe new datasets listed below are candidates for the data grid that you can investigate:\n\n"
        datagridstartmsg = "\nDatagrid {}:\n\n"
        subject = "Candidates for the datagrid"
        sheetname = "Datagrid"
        datasets_modified_yesterday = (
            self.databasequeries.get_datasets_modified_yesterday())
        emails = dict()
        for datagridname in self.sheet.datagrids:
            datasets = list()
            datagrid = self.sheet.datagrids[datagridname]
            for category in datagrid:
                if category in ["datagrid", "owner"]:
                    continue
                runyesterday = self.databasequeries.run_numbers[1][
                    1].isoformat()
                runtoday = self.databasequeries.run_numbers[0][1].isoformat()
                query = f'metadata_created:[{runyesterday}Z TO {runtoday}Z] AND {datagrid["datagrid"]} AND ({datagrid[category]})'
                datasetinfos = datasetclass.search_in_hdx(fq=query)
                for datasetinfo in datasetinfos:
                    dataset_id = datasetinfo["id"]
                    if dataset_id not in [
                            dataset["id"] for dataset in datasets
                    ]:
                        dataset = datasets_modified_yesterday.get(dataset_id)
                        if dataset is not None:
                            datasets.append(dataset)
            if len(datasets) == 0:
                logger.info(nodatasetsmsg.format(datagridname))
                continue
            owner = datagrid["owner"]
            datagridmsg = datagridstartmsg.format(datagridname)
            msg, htmlmsg = self.email.prepare_admin_emails(
                self.hdxhelper,
                datasets,
                datagridmsg,
                self.sheet,
                sheetname,
                dutyofficer=owner,
            )
            if msg is not None:
                ownertuple = (owner["name"], owner["email"])
                owneremails = emails.get(ownertuple, dict())
                for submsg in msg:
                    dict_of_lists_add(owneremails, "plain", submsg)
                for subhtmlmsg in htmlmsg:
                    dict_of_lists_add(owneremails, "html", subhtmlmsg)
                emails[ownertuple] = owneremails
        if recipients is None and len(self.sheet.datagridccs) != 0:
            users_to_email = self.sheet.datagridccs
        else:
            users_to_email = recipients
        for ownertuple in sorted(emails):
            owneremails = emails[ownertuple]
            owner = {"name": ownertuple[0], "email": ownertuple[1]}
            self.email.send_admin_summary(
                owner,
                users_to_email,
                owneremails,
                subject,
                startmsg,
                log=True,
                recipients_in_cc=True,
            )
Code Example #25
def generate_dataset_and_showcases(
    downloader, countryiso, indicator_metadata, countryalias
):
    """Parse json of the form:
    {'id': '1482', 'title': 'The spatial distribution of population in 2000,
        Zimbabwe', 'desc': 'Estimated total number of people per grid-cell...',  'doi': '10.5258/SOTON/WP00645',
        'date': '2018-11-01', 'popyear': '2000', 'citation': 'WorldPop',
        'data_file': 'GIS/Population/Global_2000_2020/2000/ZWE/zwe_ppp_2000.tif', 'archive': 'N', 'public': 'Y',
        'source': 'WorldPop, University of Southampton, UK', 'data_format': 'Geotiff', 'author_email': '*****@*****.**',
        'author_name': 'WorldPop', 'maintainer_name': 'WorldPop', 'maintainer_email': '*****@*****.**',
        'project': 'Population', 'category': 'Global per country 2000-2020', 'gtype': 'Population',
        'continent': 'Africa', 'country': 'Zimbabwe', 'iso3': 'ZWE',
        'files': ['ftp://ftp.worldpop.org.uk/GIS/Population/Global_2000_2020/2000/ZWE/zwe_ppp_2000.tif'],
        'url_img': 'https://www.worldpop.org/tabs/gdata/img/1482/zwe_ppp_wpgp_2000_Image.png',
        'organisation': 'WorldPop, University of Southampton, UK, www.worldpop.org',
        'license': 'https://www.worldpop.org/data/licence.txt',
        'url_summary': 'https://www.worldpop.org/geodata/summary?id=1482'}
    """
    allmetadata = dict()
    for subalias in countryalias:
        urls = countryalias[subalias]
        allmetadata_subalias = allmetadata.get(subalias, list())
        for url in urls:
            downloader.download(url)
            json = downloader.get_json()
            data = json["data"]
            if isinstance(data, list):
                allmetadata_subalias.extend(data)
            else:
                allmetadata_subalias.append(data)
        allmetadata[subalias] = allmetadata_subalias
    allmetadatavalues = list(allmetadata.values())
    lastmetadata = allmetadatavalues[0][-1]
    indicator_title = indicator_metadata["title"]
    if countryiso == "World":
        countryname = countryiso
    else:
        countryname = Country.get_country_name_from_iso3(countryiso)
        if not countryname:
            logger.exception(f"ISO3 {countryiso} not recognised!")
            return None, None
    title = f"{countryname} - {indicator_title}"
    slugified_name = slugify(f"WorldPop {indicator_title} for {countryname}").lower()
    logger.info(f"Creating dataset: {title}")
    licence_url = lastmetadata[
        "license"
    ].lower()  # suggest that they remove license and rename this field license
    downloader.download(licence_url)
    licence = downloader.get_text()
    methodologies = list()
    url_imgs = list()
    for allmetadatavalue in allmetadatavalues:
        lastallmetadatavalue = allmetadatavalue[-1]
        methodologies.append(lastallmetadatavalue["desc"])
        url_img = lastallmetadatavalue["url_img"]
        if not url_img:
            for lastallmetadatavalue in reversed(allmetadatavalue[:-1]):
                url_img = lastallmetadatavalue["url_img"]
                if url_img:
                    break
        url_imgs.append(url_img)
    methodology = get_matching_then_nonmatching_text(methodologies)
    dataset = Dataset(
        {
            "name": slugified_name,
            "title": title,
            "notes": f"{indicator_metadata['desc']}  \nData for earlier dates is available directly from WorldPop.  \n  \n{lastmetadata['citation']}",
            "methodology": "Other",
            "methodology_other": methodology,
            "dataset_source": lastmetadata["source"],
            "license_id": "hdx-other",
            "license_other": licence,
            "private": False,
        }
    )
    dataset.set_maintainer("37023db4-a571-4f28-8d1f-15f0353586af")
    dataset.set_organization("3f077dff-1d05-484d-a7c2-4cb620f22689")
    dataset.set_expected_update_frequency("Every year")
    dataset.set_subnational(True)
    try:
        dataset.add_other_location(countryiso)
    except HDXError as e:
        logger.exception(f"{countryname} has a problem! {e}")
        return None, None

    tags = [indicator_metadata["name"].lower(), "geodata"]
    dataset.add_tags(tags)

    earliest_year = 10000
    latest_year = 0
    resources_dict = dict()
    for subalias in allmetadata:
        for metadata in allmetadata[subalias]:
            if metadata["public"].lower() != "y":
                continue
            year = metadata["popyear"]
            if not year:
                year = metadata["date"][:4]
            year = int(year)
            if year > latest_year:
                latest_year = year
            if year < earliest_year:
                earliest_year = year
            for url in sorted(metadata["files"], reverse=True):
                resource_name = url[url.rfind("/") + 1 :]
                description = metadata["title"]
                if not re.match(r".*([1-3][0-9]{3})", resource_name):
                    resource_parts = resource_name.split(".")
                    resource_name = f"{resource_parts[0]}_{year}"
                    if len(resource_parts) >= 2:
                        resource_name = f"{resource_name}.{resource_parts[1]}"
                    description = f"{description} in {year}"
                resource = {
                    "name": resource_name,
                    "format": metadata["data_format"],
                    "url": url,
                    "description": description,
                }
                dict_of_lists_add(resources_dict, year, resource)
    if not resources_dict:
        logger.error(f"{title} has no data!")
        return None, None
    for year in sorted(resources_dict.keys(), reverse=True)[:5]:  # Just get last 5 years of data
        for resource in resources_dict[year]:
            dataset.add_update_resource(resource)

    dataset.set_dataset_year_range(earliest_year, latest_year)

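    # Build one showcase per metadata group that has a summary image, linking to its WorldPop summary page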
    showcases = list()
    for i, url_img in enumerate(url_imgs):
        if not url_img:
            continue
        allmetadatavalue = allmetadatavalues[i][-1]
        url_summary = allmetadatavalue["url_summary"]
        if i == 0:
            name = f"{slugified_name}-showcase"
        else:
            name = f"{slugified_name}-{i + 1}-showcase"
        showcase = Showcase(
            {
                "name": name,
                "title": f"WorldPop {countryname} {indicator_title} Summary Page",
                "notes": f"Summary for {allmetadatavalue['category']} - {countryname}",
                "url": url_summary,
                "image_url": url_img,
            }
        )
        showcase.add_tags(tags)
        showcases.append(showcase)
    return dataset, showcases
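
All of these snippets lean on dict_of_lists_add to append a value to a list keyed in a dict, creating the list on first use. A minimal sketch of that behaviour (assumed to match the hdx.utilities.dictandlist helper; the data is made up):

def dict_of_lists_add(dictionary, key, value):
    # Append value under key, creating the list the first time the key is seen
    dictionary.setdefault(key, list()).append(value)

resources_by_year = dict()
dict_of_lists_add(resources_by_year, 2020, {'name': 'zwe_ppp_2020.tif'})
dict_of_lists_add(resources_by_year, 2020, {'name': 'zwe_ppp_2020_UNadj.tif'})
# resources_by_year == {2020: [{'name': 'zwe_ppp_2020.tif'}, {'name': 'zwe_ppp_2020_UNadj.tif'}]}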
Code example #26
0
    def get_regional(self, regionlookup, national_headers, national_columns, population_lookup=None, *args):
        if population_lookup is None:
            process_cols = self.region_config['process_cols']
        else:
            process_cols = {'Population': {'action': 'sum'}}
        desired_headers = process_cols.keys()
        message = 'Regional header {} not found in national headers!'
        regional_headers, regional_columns = self.get_headers_and_columns(desired_headers, national_headers,
                                                                          national_columns, message)
        valdicts = list()
        for i, header in enumerate(regional_headers[0]):
            valdict = dict()
            valdicts.append(valdict)
            process_info = process_cols[header]
            column = regional_columns[i]
            for countryiso in column:
                for region in regionlookup.iso3_to_region_and_hrp[countryiso]:
                    if not self.should_process(process_info, region, countryiso):
                        continue
                    dict_of_lists_add(valdict, region, column[countryiso])
            self.process(process_info, valdicts, regional_headers, i)

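        # When not restricted to population, multi_cols combine several national columns: per region/country pair, take the first non-empty value across their input headers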
        if population_lookup is None:
            multi_cols = self.region_config.get('multi_cols', dict())
            for header in multi_cols:
                multi_info = multi_cols[header]
                input_headers = multi_info['headers']
                ignore = False
                for input_header in input_headers:
                    if input_header not in national_headers[0]:
                        logger.error(message.format(input_header))
                        ignore = True
                        break
                if ignore:
                    continue
                regional_headers[0].append(header)
                regional_headers[1].append(multi_info['hxltag'])
                found_region_countries = set()
                valdict = dict()
                valdicts.append(valdict)
                for i, orig_header in enumerate(input_headers):
                    index = national_headers[0].index(orig_header)
                    column = national_columns[index]
                    for countryiso in column:
                        for region in regionlookup.iso3_to_region_and_hrp[countryiso]:
                            if not self.should_process(multi_info, region, countryiso):
                                continue
                            key = f'{region}|{countryiso}'
                            if key in found_region_countries:
                                continue
                            value = column[countryiso]
                            if value:
                                found_region_countries.add(key)
                                dict_of_lists_add(valdict, region, value)
                self.process(multi_info, valdicts, regional_headers, len(regional_headers[0]) - 1)

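        # Merge in any pre-aggregated (headers, valdicts) pairs passed as extra args, matched by HXL tag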
        for arg in args:
            gheaders, gvaldicts = arg
            if gheaders:
                for i, header in enumerate(gheaders[1]):
                    try:
                        j = regional_headers[1].index(header)
                    except ValueError:
                        continue
                    valdicts[j].update(gvaldicts[i])

        add_population(population_lookup, regional_headers, valdicts)
        logger.info('Processed regional')
        return regional_headers, valdicts
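
get_regional buckets each national value under every region the country maps to, then collapses each bucket according to that column's process configuration. A hedged sketch of the 'sum' action with made-up regions and values (the real code delegates this step to self.process):

iso3_to_region_and_hrp = {'AFG': ['ROAP', 'HRPs'], 'ETH': ['ROSEA', 'HRPs']}  # illustrative lookup
national_column = {'AFG': 100, 'ETH': 250}

valdict = dict()
for countryiso, value in national_column.items():
    for region in iso3_to_region_and_hrp[countryiso]:
        valdict.setdefault(region, list()).append(value)

summed = {region: sum(values) for region, values in valdict.items()}
# summed == {'ROAP': 100, 'HRPs': 350, 'ROSEA': 250}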
Code example #27
0
def add_other_requirements_and_funding(iso3, name, req, fund, pct):
    dict_of_lists_add(other_planname, iso3, name)
    if req:
        dict_of_lists_add(other_requirements, iso3, req)
    else:
        dict_of_lists_add(other_requirements, iso3, None)
    if fund and req:
        dict_of_lists_add(other_funding, iso3, fund)
        dict_of_lists_add(other_percentage, iso3, pct)
    else:
        dict_of_lists_add(other_funding, iso3, None)
        dict_of_lists_add(other_percentage, iso3, None)
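
This closure relies on dicts defined in its enclosing scope (other_planname, other_requirements, other_funding, other_percentage); padding with None keeps the four per-country lists the same length. A hedged usage sketch, assuming those dicts were initialised as empty dicts, with made-up values:

add_other_requirements_and_funding('SDN', 'Regional RRP', 500000, None, None)
add_other_requirements_and_funding('SDN', 'Flash Appeal', 250000, 100000, 40)
# other_planname['SDN']     == ['Regional RRP', 'Flash Appeal']
# other_requirements['SDN'] == [500000, 250000]
# other_funding['SDN']      == [None, 100000]  # None keeps positions aligned with the plan names
# other_percentage['SDN']   == [None, 40]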
Code example #28
0
File: ipc.py Project: OCHA-DAP/hdx-scraper-covid-viz
def get_ipc(configuration,
            today,
            gho_countries,
            adminone,
            downloader,
            scrapers=None):
    name = inspect.currentframe().f_code.co_name
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list(), list(), list()
    ipc_configuration = configuration['ipc']
    url = ipc_configuration['url']
    phases = ['3', '4', '5', 'P3+']
    projections = ['Current', 'First Projection', 'Second Projection']
    national_populations = {phase: dict() for phase in phases}
    national_analysed = dict()
    national_period = dict()
    national_start = dict()
    national_end = dict()
    subnational_populations = {phase: dict() for phase in phases}
    for countryiso3 in gho_countries:
        countryiso2 = Country.get_iso2_from_iso3(countryiso3)
        data, adm1_names = get_data(downloader, url, today, countryiso2)
        if not data:
            continue
        row = data[0]
        analysis_period, start, end = get_period(today, row, projections)
        for phase in phases:
            national_populations[phase][countryiso3] = row[
                f'{analysis_period} Phase {phase} #']
        national_analysed[countryiso3] = row['Current Population Analysed #']
        national_period[countryiso3] = analysis_period
        national_start[countryiso3] = start
        national_end[countryiso3] = end
        for row in data[1:]:
            country = row['Country']
            if adm1_names:
                if country not in adm1_names:
                    continue
                adm1_name = country
            else:
                adm1_name = row['Area']
                if not adm1_name or adm1_name == country:
                    continue
            pcode, _ = adminone.get_pcode(countryiso3, adm1_name, 'IPC')
            if not pcode:
                continue
            for phase in phases:
                population = row[f'{analysis_period} Phase {phase} #']
                if population:
                    dict_of_lists_add(subnational_populations[phase], pcode,
                                      population)
    for phase in phases:
        subnational_population = subnational_populations[phase]
        for pcode, populations in subnational_population.items():
            if len(populations) == 1:
                subnational_population[pcode] = populations[0]
            else:
                subnational_population[pcode] = sum(populations)
    logger.info('Processed IPC')
    dataset = Dataset.read_from_hdx(ipc_configuration['dataset'])
    date = get_date_from_dataset_date(dataset, today=today)
    headers = [f'FoodInsecurityIPC{phase}' for phase in phases]
    headers.append('FoodInsecurityIPCAnalysedNum')
    headers.append('FoodInsecurityIPCAnalysisPeriod')
    headers.append('FoodInsecurityIPCAnalysisPeriodStart')
    headers.append('FoodInsecurityIPCAnalysisPeriodEnd')
    hxltags = [f'#affected+food+ipc+p{phase}+num' for phase in phases[:-1]]
    hxltags.append('#affected+food+ipc+p3plus+num')
    hxltags.append('#affected+food+ipc+analysed+num')
    hxltags.append('#date+ipc+period')
    hxltags.append('#date+ipc+start')
    hxltags.append('#date+ipc+end')
    national_outputs = [national_populations[phase] for phase in phases]
    national_outputs.append(national_analysed)
    national_outputs.append(national_period)
    national_outputs.append(national_start)
    national_outputs.append(national_end)
    subnational_outputs = [subnational_populations[phase] for phase in phases]
    return [headers, hxltags], national_outputs, [headers[:-4], hxltags[:-4]], subnational_outputs, \
           [(hxltag, date, dataset['dataset_source'], dataset.get_hdx_url()) for hxltag in hxltags]
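
get_ipc returns five values: national (headers, hxltags), the matching national output dicts, the subnational subset of those headers, the subnational output dicts, and one source tuple per HXL tag. A hedged sketch of how a caller might unpack them (variable names are illustrative):

national_hxl, national_outputs, subnational_hxl, subnational_outputs, sources = get_ipc(
    configuration, today, gho_countries, adminone, downloader)
headers, hxltags = national_hxl
p3plus_by_iso3 = national_outputs[3]      # 'P3+' populations keyed by country ISO3
p3plus_by_pcode = subnational_outputs[3]  # same phase keyed by admin1 p-code
for hxltag, date, source, source_url in sources:
    print(hxltag, date, source, source_url)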
Code example #29
0
def get_regional(configuration, national_headers, national_columns, admininfo):
    regional_config = configuration['regional']
    val_fns = regional_config['val_fns']
    headers = val_fns.keys()
    regional_headers = [list(), list()]
    regional_columns = list()
    for i, header in enumerate(national_headers[0][3:]):
        if header not in headers:
            continue
        regional_headers[0].append(header)
        regional_headers[1].append(national_headers[1][3 + i])
        regional_columns.append(national_columns[i])
    valdicts = list()
    for i, header in enumerate(regional_headers[0]):
        valdict = dict()
        valdicts.append(valdict)
        action = val_fns[header]
        column = regional_columns[i]
        for countryiso in column:
            for region in admininfo.iso3_to_region_and_hrp[countryiso]:
                dict_of_lists_add(valdict, region, column[countryiso])
        if action == 'sum':
            for region, valuelist in valdict.items():
                total = ''
                for valuestr in valuelist:
                    if valuestr:
                        value = get_numeric(valuestr)
                        if value:
                            if total == '':
                                total = value
                            else:
                                total += value
                if isinstance(total, float):
                    valdict[region] = number_format(total)
                else:
                    valdict[region] = total
        elif action == 'range':
            for region, valuelist in valdict.items():
                minval = sys.maxsize
                maxval = -minval
                for valuestr in valuelist:
                    if valuestr:
                        value = get_numeric(valuestr)
                        if value > maxval:
                            maxval = value
                        if value < minval:
                            minval = value
                if minval == sys.maxsize or maxval == -sys.maxsize:
                    valdict[region] = ''
                else:
                    if isinstance(maxval, float):
                        maxval = number_format(maxval)
                    if isinstance(minval, float):
                        minval = number_format(minval)
                    valdict[region] = '%s-%s' % (str(minval), str(maxval))
        else:
            for region, valuelist in valdict.items():
                toeval = action
                for j in range(i):
                    value = valdicts[j].get(region, '')
                    if value == '':
                        value = None
                    toeval = toeval.replace(regional_headers[0][j], str(value))
                valdict[region] = eval(toeval)
    logger.info('Processed regional')
    return regional_headers, valdicts
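
When the action is neither 'sum' nor 'range' it is treated as a formula: earlier regional headers named in the string are replaced with that region's already-aggregated values and the result is passed to eval. A hedged illustration with a hypothetical formula and an assumed get_fraction_str helper:

valdicts = [{'HRPs': 1000}, {'HRPs': 4000}, dict()]
regional_headers = [['Funding', 'Requirements', 'PercentFunded'],
                    ['#value+funding+total', '#value+funding+required', '#value+funded+pct']]
action = 'get_fraction_str(Funding, Requirements)'  # hypothetical formula from val_fns

def get_fraction_str(numerator, denominator):  # assumed helper
    return '%.2f' % (numerator / denominator) if denominator else ''

toeval = action
for j in range(2):
    toeval = toeval.replace(regional_headers[0][j], str(valdicts[j].get('HRPs', '')))
valdicts[2]['HRPs'] = eval(toeval)  # toeval == 'get_fraction_str(1000, 4000)' -> '0.25'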
Code example #30
0
def add_food_prices(configuration,
                    today,
                    countryiso3s,
                    retriever,
                    basic_auths,
                    scrapers=None):
    name = 'food_prices'
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list()
    datasetinfo = configuration[name]
    read_hdx_metadata(datasetinfo, today=today)
    base_url = datasetinfo['base_url']
    if retriever.use_saved:
        headers = None
    else:
        basic_auth = basic_auths[name]
        token_downloader = Download(basic_auth=basic_auth)
        token_downloader.download(
            f'{base_url}/token',
            post=True,
            parameters={'grant_type': 'client_credentials'})
        access_token = token_downloader.get_json()['access_token']
        headers = {
            'Accept': 'application/json',
            'Authorization': f'Bearer {access_token}'
        }

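    # Paginated fetch of one API endpoint for a country; a missing saved file falls back to empty results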
    def get_list(endpoint, countryiso3, startdate=None):
        url = f'{base_url}/{endpoint}'
        filename = url.split('/')[-2]
        page = 1
        all_data = []
        data = None
        while data is None or len(data) > 0:
            parameters = {'CountryCode': countryiso3, 'page': page}
            if startdate:
                parameters['startDate'] = startdate
            try:
                json = retriever.retrieve_json(
                    url,
                    f'{filename}_{countryiso3}_{page}.json',
                    f'{filename} for {countryiso3} page {page}',
                    False,
                    parameters=parameters,
                    headers=headers)
            except FileNotFoundError:
                json = {'items': list()}
            data = json['items']
            all_data.extend(data)
            page = page + 1
        return all_data

    six_months_ago = today - relativedelta(months=6)
    ratios = dict()
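    # Category weights set how much each commodity counts; a pewi value >= 1.0 adds it to the market's crisis numerator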
    category_id_weights = {1: 2, 2: 4, 3: 4, 4: 1, 5: 3, 6: 0.5, 7: 0.5}
    for countryiso3 in countryiso3s:
        logger.info(f'Processing {countryiso3}')
        commodities = get_list('vam-data-bridges/1.1.0/Commodities/List',
                               countryiso3)
        if not commodities:
            logger.info(f'{countryiso3} has no commodities!')
            continue
        commodity_id_to_category_id = {
            x['id']: x['categoryId']
            for x in commodities
        }
        alps = get_list('vam-data-bridges/1.1.0/MarketPrices/Alps',
                        countryiso3, six_months_ago)
        if not alps:
            logger.info(f'{countryiso3} has no ALPS!')
            continue
        yearmonth_rows = dict()
        for row in alps:
            analysis_value_price_flag = row['analysisValuePriceFlag']
            if analysis_value_price_flag == 'forecast':
                continue
            commodity_id = row['commodityID']
            category_id = commodity_id_to_category_id.get(commodity_id)
            if not category_id or category_id >= 8:
                continue
            row['categoryId'] = category_id
            yearmonth = f'{row["commodityPriceDateYear"]}/{row["commodityPriceDateMonth"]}'
            dict_of_lists_add(yearmonth_rows, yearmonth, row)
        yearmonths = yearmonth_rows.keys()
        if len(yearmonths) == 0:
            logger.info(f'{countryiso3} has no values!')
            continue
        latest_yearmonth = max(yearmonths)
        commodities_per_market = dict()
        commodities_per_market_crisis = dict()
        for row in yearmonth_rows[latest_yearmonth]:
            market_id = row['marketID']
            category_id = row['categoryId']
            weighted_value = category_id_weights[category_id]
            commodities_per_market[market_id] = commodities_per_market.get(
                market_id, 0) + weighted_value
            pewivalue = row['analysisValuePewiValue']
            if pewivalue >= 1.0:
                commodities_per_market_crisis[
                    market_id] = commodities_per_market_crisis.get(
                        market_id, 0) + weighted_value
        country_ratio = 0
        for market_id in commodities_per_market:
            market_ratio = commodities_per_market_crisis.get(
                market_id, 0) / commodities_per_market[market_id]
            country_ratio += market_ratio
        country_ratio /= len(commodities_per_market)
        ratios[countryiso3] = number_format(country_ratio,
                                            trailing_zeros=False)
    hxltag = '#value+food+num+ratio'
    logger.info('Processed WFP')
    return [['Food Prices Ratio'], [hxltag]
            ], [ratios], [(hxltag, datasetinfo['date'], datasetinfo['source'],
                           datasetinfo['source_url'])]
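
The country ratio at the end is the average, across markets, of the weighted share of commodities whose ALPS pewi value is at least 1.0. A hedged worked example with made-up rows and the same category weights:

category_id_weights = {1: 2, 2: 4, 3: 4, 4: 1, 5: 3, 6: 0.5, 7: 0.5}
rows = [  # (marketID, categoryId, analysisValuePewiValue) - illustrative values only
    (1, 1, 1.3),  # market 1, weight 2, counts toward crisis
    (1, 2, 0.4),  # market 1, weight 4, does not
    (2, 3, 1.1),  # market 2, weight 4, counts toward crisis
]
commodities_per_market, commodities_per_market_crisis = dict(), dict()
for market_id, category_id, pewi in rows:
    weight = category_id_weights[category_id]
    commodities_per_market[market_id] = commodities_per_market.get(market_id, 0) + weight
    if pewi >= 1.0:
        commodities_per_market_crisis[market_id] = commodities_per_market_crisis.get(market_id, 0) + weight
country_ratio = sum(
    commodities_per_market_crisis.get(m, 0) / commodities_per_market[m]
    for m in commodities_per_market) / len(commodities_per_market)
# market 1: 2/6, market 2: 4/4 -> country_ratio == (0.333... + 1.0) / 2 ~= 0.67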