Code example #1
    def get_dates_from_title(cls, title):
        # type: (str) -> Tuple[str,List[Tuple[datetime,datetime]]]
        """
        Get dataset dates (start and end dates in a list) from title and clean title of dates

        Args:
            title (str): Title to get date from and clean

        Returns:
            Tuple[str,List[Tuple[datetime,datetime]]]: Cleaned title, list of start and end dates

        """
        ranges = list()
        ignore_wrong_years = list()
        for match in cls.YEAR_RANGE_PATTERN.finditer(title):
            first_year, first_month, second_year = cls.get_month_year_in_slash_range(match, ignore_wrong_years)
            if first_year is None:
                continue
            if first_month is None:
                first_month = 1
            startdate = parse_date('%d-%d-01' % (first_year, first_month), '%Y-%m-%d', zero_time=True)
            enddate = parse_date('%s-12-31' % match.group(5), '%Y-%m-%d', zero_time=True)
            ranges.append((startdate, enddate))
            newtitle = remove_string(title, match.group(0))
            logger.info('Removing date range from title: %s -> %s' % (title, newtitle))
            title = newtitle

        for match in cls.YEAR_RANGE_PATTERN2.finditer(title):
            first_year, first_month, second_year = cls.get_month_year_in_slash_range(match, ignore_wrong_years)
            if first_year is None or second_year is None:
                continue
            startdate = parse_date('%d-01-01' % first_year, '%Y-%m-%d', zero_time=True)
            enddate = parse_date('%d-12-31' % second_year, '%Y-%m-%d', zero_time=True)
            ranges.append((startdate, enddate))
            newtitle = remove_string(title, match.group(0))
            logger.info('Removing date range from title: %s -> %s' % (title, newtitle))
            title = newtitle

        title = cls.fuzzy_match_dates_in_title(title, ranges, ignore_wrong_years)

        for match in cls.WORD_RIGHT_BRACKET_PATTERN.finditer(title):
            word = match.group(2)
            if word in cls.DATE_INTRO_WORDS:
                title = title.replace(match.group(0), ')')

        for match in cls.EMPTY_BRACKET_PATTERN.finditer(title):
            title = title.replace(match.group(0), ' ')
        title = remove_end_characters(title, '%s%s' % (PUNCTUATION_MINUS_BRACKETS, whitespace))
        title = remove_from_end(title, ['as of'] + cls.DATE_INTRO_WORDS, 'Removing - from title: %s -> %s')
        return title, sorted(ranges)
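
The snippet above builds range endpoints with parse_date using an explicit format string and zero_time=True. A minimal standalone sketch of that call pattern for the year-range branch, assuming parse_date is imported from hdx.utilities.dateparse and using hypothetical year values:

from hdx.utilities.dateparse import parse_date

# Hypothetical year range such as might be matched in a title like "Dataset 2015-2017"
first_year, second_year = 2015, 2017
startdate = parse_date('%d-01-01' % first_year, '%Y-%m-%d', zero_time=True)
enddate = parse_date('%d-12-31' % second_year, '%Y-%m-%d', zero_time=True)
# startdate and enddate are datetime objects spanning the whole range
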
Code example #2
    def __init__(self, level, datasetinfo, headers, maxdateonly=True):
        if isinstance(level, str):
            if level == 'global':
                level = None
            elif level == 'national':
                level = 0
            else:
                level = 1
        self.level = level
        self.datecol = datasetinfo.get('date_col')
        self.datetype = datasetinfo.get('date_type')
        if self.datetype:
            if self.datetype == 'date':
                date = parse_date('1900-01-01')
            else:
                date = 0
        else:
            date = 0
        self.admininfo = AdminInfo.get()
        self.admcols = datasetinfo.get('adm_cols', list())
        if self.level is None:
            self.maxdate = date
        else:
            if self.level > len(self.admcols):
                raise ValueError('No admin columns specified for required level!')
            self.maxdates = {adm: date for adm in self.admininfo.adms[self.level]}

        self.maxdateonly = maxdateonly
        self.flatteninfo = datasetinfo.get('flatten')
        self.headers = headers
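
A short sketch of the sentinel-date pattern used in this constructor: when the date column holds real dates, the running maximum starts from an "earliest possible" value so that any parsed date replaces it, while integer date types start from 0. parse_date is assumed to come from hdx.utilities.dateparse; the datasetinfo value is hypothetical.

from hdx.utilities.dateparse import parse_date

datetype = 'date'  # hypothetical value from datasetinfo.get('date_type')
date = parse_date('1900-01-01') if datetype == 'date' else 0
maxdate = date  # later rows overwrite this with newer dates
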
Code example #3
 def get_transaction(configuration, dtransaction, activity_identifier):
     # We're not interested in transactions that have no value
     if not dtransaction.value:
         return None
     # We're only interested in some transaction types
     transaction_type_info = configuration["transaction_type_info"].get(
         dtransaction.type)
     if not transaction_type_info:
         return None
     # We're not interested in transactions that can't be valued
     try:
         # Use value-date falling back on date
         date = dtransaction.value_date
         if not date:
             date = dtransaction.date
         # Convert the transaction value to USD
         currency = dtransaction.currency
         if currency is None:
             logger.error(
                 f"Activity {activity_identifier} transaction with value {dtransaction.value} currency error!"
             )
             return None
         value = Currency.get_historic_value_in_usd(dtransaction.value,
                                                    currency,
                                                    parse_date(date))
     except (ValueError, CurrencyError):
         logger.exception(
             f"Activity {activity_identifier} transaction with value {dtransaction.value} USD conversion failed!"
         )
         return None
     return Transaction(transaction_type_info, dtransaction, value)
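
The value-date fallback at the heart of this function can be sketched on its own; the transaction object here is a hypothetical stand-in, and parse_date (assumed to come from hdx.utilities.dateparse) is assumed to accept the date strings found in IATI transactions, as the call above implies.

from hdx.utilities.dateparse import parse_date

class FakeTransaction:  # illustrative stand-in for dtransaction
    value_date = None
    date = '2021-03-15'

dtransaction = FakeTransaction()
date = dtransaction.value_date
if not date:
    date = dtransaction.date
when = parse_date(date)  # datetime used for the historic currency conversion
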
Code example #4
 def check_date(date):
     nonlocal conflict_start_date, start_date_in_conflict
     if start_date_in_conflict:
         return
     if not date:
         return
     start_date = parse_date(date)
     if start_date >= conflict_start_date:
         start_date_in_conflict = True
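
This closure is used inside exclude_dactivity (Code example #11) to flag activities starting on or after the conflict cutoff. A self-contained sketch of the same check, with hypothetical dates and a module-level flag in place of nonlocal:

from hdx.utilities.dateparse import parse_date

conflict_start_date = parse_date('2022-02-24')
start_date_in_conflict = False

def check_date(date):
    global start_date_in_conflict
    if start_date_in_conflict or not date:
        return
    if parse_date(date) >= conflict_start_date:
        start_date_in_conflict = True

check_date('2022-06-01')  # sets start_date_in_conflict to True
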
Code example #5
    def do_set_value(self, row, scrapername=None):
        adms = [None for _ in range(len(self.admcols))]

        def get_adm(admcol, i):
            match = template.search(admcol)
            if match:
                template_string = match.group()
                admcol = self.headers[int(template_string[2:-2])]
            adm = row[admcol]
            if not adm:
                return False
            adms[i] = row[admcol].strip()
            return self.admininfo.get_adm(adms, i, scrapername)

        for i, admcol in enumerate(self.admcols):
            if admcol is None:
                continue
            if isinstance(admcol, str):
                admcol = [admcol]
            for admcl in admcol:
                exact = get_adm(admcl, i)
                if adms[i] and exact:
                    break
            if not adms[i]:
                return None, None
        if self.datecol:
            date = row[self.datecol]
            if self.datetype == 'int':
                date = int(date)
            else:
                if not isinstance(date, datetime):
                    date = parse_date(date)
                date = date.replace(tzinfo=None)
            if self.level is None:
                if self.maxdateonly:
                    if date < self.maxdate:
                        return None, None
                    self.maxdate = date
            else:
                if self.maxdateonly:
                    if date < self.maxdates[adms[self.level]]:
                        return None, None
                self.maxdates[adms[self.level]] = date
        else:
            date = None
        if self.level is None:
            return 'global', date
        return adms[self.level], date
Code example #6
def get_monthly_report_source(configuration):
    monthly_report_configuration = configuration['monthly_report']
    dataset = monthly_report_configuration['dataset']
    resource = monthly_report_configuration['resource']
    if isinstance(dataset, str):
        dataset = Dataset.read_from_hdx(dataset)
        resource_name = resource
        resource = None
        for res in dataset.get_resources():
            if res['name'] == resource_name:
                resource = res
                break
        if not resource:
            raise ValueError('No monthly report resource found!')
    last_modified = parse_date(resource['last_modified']).isoformat()[:10]

    return monthly_report_configuration['hxltag'], last_modified, dataset[
        'dataset_source'], resource['url']
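
The last_modified handling above parses a timestamp and truncates its ISO form to a date. A minimal sketch with a hypothetical date string (parse_date, assumed to come from hdx.utilities.dateparse, returns a datetime, so isoformat()[:10] yields YYYY-MM-DD):

from hdx.utilities.dateparse import parse_date

last_modified = parse_date('2021-02-03').isoformat()[:10]  # '2021-02-03'
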
Code example #7
def get_latest_columns(date, base_url, countryiso3s, input_cols, downloader):
    countries_index = download_data(date, base_url, countryiso3s, input_cols, downloader)
    valuedicts = [dict() for _ in input_cols]
    crisis_types = dict()
    max_date = default_date
    for countryiso3, country_data in countries_index.items():
        crises_types = country_data['ind_agg_type'].get('Aggregated')
        if not crises_types:
            crises_types = country_data['ind_agg_type'].get('Individual')
        type_of_crisis = crises_types[0]
        crisis_types[countryiso3] = type_of_crisis
        crisis = country_data['crises'][type_of_crisis]
        for i, input_col in enumerate(input_cols):
            val, last_updated = crisis[input_col]
            valuedicts[i][countryiso3] = val
            date = parse_date(last_updated)
            if date > max_date:
                max_date = date
    return valuedicts, crisis_types, max_date
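
The max_date bookkeeping above starts from a default and keeps the newest parsed last_updated value. A standalone sketch, where default_date is approximated by an early sentinel and the update strings are hypothetical:

from hdx.utilities.dateparse import parse_date

max_date = parse_date('1900-01-01')  # stand-in for default_date
for last_updated in ('2021-01-05', '2021-03-17', '2021-02-28'):
    date = parse_date(last_updated)
    if date > max_date:
        max_date = date
# max_date now holds the 2021-03-17 datetime
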
Code example #8
def get_unhcr(configuration,
              today,
              today_str,
              countryiso3s,
              downloader,
              scrapers=None):
    name = inspect.currentframe().f_code.co_name
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list()
    iso3tocode = downloader.download_tabular_key_value(
        join('config', 'UNHCR_geocode.csv'))
    unhcr_configuration = configuration['unhcr']
    base_url = unhcr_configuration['url']
    population_collections = unhcr_configuration['population_collections']
    exclude = unhcr_configuration['exclude']
    valuedicts = [dict(), dict()]
    for countryiso3 in countryiso3s:
        if countryiso3 in exclude:
            continue
        code = iso3tocode.get(countryiso3)
        if not code:
            continue
        for population_collection in population_collections:
            r = downloader.download(base_url % (population_collection, code))
            data = r.json()['data'][0]
            individuals = data['individuals']
            if individuals is None:
                continue
            date = data['date']
            if parse_date(date) < today - relativedelta(years=2):
                continue
            existing_individuals = valuedicts[0].get(countryiso3)
            if existing_individuals is None:
                valuedicts[0][countryiso3] = int(individuals)
                valuedicts[1][countryiso3] = date
            else:
                valuedicts[0][countryiso3] += int(individuals)
    logger.info('Processed UNHCR')
    hxltags = ['#affected+refugees', '#affected+date+refugees']
    return [['TotalRefugees', 'TotalRefugeesDate'], hxltags], valuedicts, [
        (hxltag, today_str, 'UNHCR', unhcr_configuration['source_url'])
        for hxltag in hxltags
    ]
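
Records older than two years are skipped above with a relativedelta comparison. A sketch of just that filter, with hypothetical dates; relativedelta comes from dateutil as in the original, and parse_date is assumed to come from hdx.utilities.dateparse:

from dateutil.relativedelta import relativedelta
from hdx.utilities.dateparse import parse_date

today = parse_date('2021-05-03')
date = '2018-12-31'
if parse_date(date) < today - relativedelta(years=2):
    pass  # too old: this country/collection entry would be skipped
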
Code example #9
    def __init__(self, level, datasetinfo, headers, indicatorcols, maxdateonly=True):
        if isinstance(level, str):
            if level == 'global':
                level = None
            elif level == 'national':
                level = 0
            else:
                level = 1
        self.level = level
        self.datecol = datasetinfo.get('date_col')
        self.datetype = datasetinfo.get('date_type')
        if self.datetype:
            if self.datetype == 'date':
                date = parse_date('1900-01-01')
            else:
                date = 0
        else:
            date = 0
        self.maxdate = date
        date_condition = datasetinfo.get('date_condition')
        if date_condition is not None:
            for col in datasetinfo['val_cols']:
                date_condition = date_condition.replace(col, f"row['{col}']")
        self.date_condition = date_condition
        self.admininfo = AdminInfo.get()
        self.admcols = datasetinfo.get('adm_cols', list())
        self.admexact = datasetinfo.get('adm_exact', False)
        self.indicatorcols = indicatorcols
        if self.level is None:
            self.maxdates = {i: date for i, _ in enumerate(indicatorcols)}
        else:
            if self.level > len(self.admcols):
                raise ValueError('No admin columns specified for required level!')
            self.maxdates = {i: {adm: date for adm in self.admininfo.adms[self.level]} for i, _ in enumerate(indicatorcols)}

        self.maxdateonly = maxdateonly
        self.flatteninfo = datasetinfo.get('flatten')
        self.headers = headers
        self.filters = dict()
        self.get_external_filter(datasetinfo)
Code example #10
 def test_get_indicators(self, configuration, folder):
     with temp_dir('TestCovidViz',
                   delete_on_success=True,
                   delete_on_failure=False) as tempdir:
         with Download(user_agent='test') as downloader:
             retriever = Retrieve(downloader,
                                  tempdir,
                                  folder,
                                  tempdir,
                                  save=False,
                                  use_saved=True)
             tabs = configuration['tabs']
             noout = NoOutput(tabs)
             jsonout = JsonOutput(configuration, tabs)
             outputs = {'gsheets': noout, 'excel': noout, 'json': jsonout}
             today = parse_date('2021-05-03')
             countries_to_save = get_indicators(
                 configuration,
                 today,
                 retriever,
                 outputs,
                 tabs,
                 scrapers=[
                     'ifi', 'who_global', 'who_national', 'who_subnational',
                     'who_covid', 'sadd', 'covidtests', 'cadre_harmonise',
                     'access', 'food_prices'
                 ],
                 use_live=False)
             filepaths = jsonout.save(tempdir,
                                      countries_to_save=countries_to_save)
             assert filecmp.cmp(filepaths[0],
                                join(folder, 'test_scraper_all.json'))
             assert filecmp.cmp(filepaths[1],
                                join(folder, 'test_scraper.json'))
             assert filecmp.cmp(filepaths[2],
                                join(folder, 'test_scraper_daily.json'))
             assert filecmp.cmp(
                 filepaths[3], join(folder,
                                    'test_scraper_covidseries.json'))
Code example #11
    def exclude_dactivity(cls, dactivity):
        if cls.has_desired_scope(dactivity.humanitarian_scopes):
            return False
        if not dactivity.humanitarian:
            return True
        if dactivity.activity_status != "2":
            return True
        conflict_start_date = parse_date("2022-02-24")
        relevant_countries = ("UA", "PL", "HU", "SK", "RO", "MD", "BY", "RU")
        start_date_in_conflict = False
        country_in_list = False
        text_in_narrative = False

        def check_date(date):
            nonlocal conflict_start_date, start_date_in_conflict
            if start_date_in_conflict:
                return
            if not date:
                return
            start_date = parse_date(date)
            if start_date >= conflict_start_date:
                start_date_in_conflict = True

        def check_countries(countries):
            nonlocal relevant_countries, country_in_list
            if country_in_list:
                return
            if not countries:
                return
            for country in countries:
                if country.code in relevant_countries:
                    country_in_list = True
                    return

        def check_narratives(title_or_desc):
            nonlocal text_in_narrative
            if text_in_narrative:
                return
            if not title_or_desc:
                return
            for lang, text in title_or_desc.narratives.items():
                text_lower = text.lower()
                if "ukraine" in text_lower or "ukrainian" in text_lower:
                    text_in_narrative = True
                    return

        check_date(dactivity.start_date_actual)
        check_countries(dactivity.recipient_countries)
        check_narratives(dactivity.title)
        check_narratives(dactivity.description)

        for dtransaction in dactivity.transactions:
            check_date(dtransaction.date)
            check_date(dtransaction.value_date)
            check_countries(dtransaction.recipient_countries)
            check_narratives(dtransaction.description)

        if not country_in_list:
            return True
        if not start_date_in_conflict:
            return True
        if not text_in_narrative:
            return True
        return False
Code example #12
def run_scrapers(datasets,
                 countryiso3s,
                 adminone,
                 level,
                 maindownloader,
                 basic_auths=dict(),
                 today=None,
                 scrapers=None,
                 population_lookup=None,
                 **kwargs):
    # type: (Dict, List[str], AdminOne, str, Download, Dict[str,str], Optional[datetime], Optional[List[str]], Dict[str,int], Any) -> Dict
    """Runs all mini scrapers given in configuration and returns headers, values and sources.

    Args:
        datasets (Dict): Configuration for mini scrapers
        countryiso3s (List[str]): List of ISO3 country codes to process
        adminone (AdminOne): AdminOne object from HDX Python Country library that handles processing of admin level 1
        level (str): Can be global, national or subnational
        maindownloader (Download): Download object for downloading files
        basic_auths (Dict[str,str]): Dictionary of basic authentication information
        today (Optional[datetime]): Value to use for today. Defaults to None (datetime.now()).
        scrapers (Optional[List[str]]): List of mini scraper names to process
        population_lookup (Dict[str,int]): Dictionary from admin code to population
        **kwargs: Variables to use when evaluating template arguments in urls

    Returns:
        Dict: Dictionary of output containing output headers, values and sources
    """
    results = {
        'headers': [list(), list()],
        'values': list(),
        'sources': list()
    }
    now = datetime.now()
    for name in datasets:
        if scrapers:
            if not any(scraper in name for scraper in scrapers):
                continue
        else:
            if name == 'population':
                continue
        logger.info('Processing %s' % name)
        basic_auth = basic_auths.get(name)
        if basic_auth is None:
            downloader = maindownloader
        else:
            downloader = Download(basic_auth=basic_auth,
                                  rate_limit={
                                      'calls': 1,
                                      'period': 0.1
                                  })
        datasetinfo = datasets[name]
        datasetinfo['name'] = name
        headers, iterator = read(downloader,
                                 datasetinfo,
                                 today=today,
                                 **kwargs)
        if 'source_url' not in datasetinfo:
            datasetinfo['source_url'] = datasetinfo['url']
        if 'date' not in datasetinfo or datasetinfo.get(
                'force_date_today', False):
            today_str = kwargs.get('today_str')
            if today_str:
                today = parse_date(today_str)
            else:
                if not today:
                    today = now
                today_str = today.strftime('%Y-%m-%d')
            datasetinfo['date'] = today_str
        _run_scraper(countryiso3s, adminone, level, today, name, datasetinfo,
                     headers, iterator, population_lookup, results)
        if downloader != maindownloader:
            downloader.close()
        if population_lookup is not None:
            add_population(population_lookup, results['headers'],
                           results['values'])
    return results
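
The date handling near the end of run_scrapers prefers an explicit today_str, otherwise falling back to the current time. A compact sketch of that branch, with the kwargs value assumed absent and parse_date assumed to come from hdx.utilities.dateparse:

from datetime import datetime
from hdx.utilities.dateparse import parse_date

today_str = None  # hypothetical: kwargs.get('today_str') returned nothing
today = None
if today_str:
    today = parse_date(today_str)
else:
    if not today:
        today = datetime.now()
    today_str = today.strftime('%Y-%m-%d')
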
Code example #13
    def test_save(self, configuration, fixtures, hxltags):
        with temp_dir('TestScraperSave', delete_on_success=True, delete_on_failure=False) as tempdir:
            with Download(user_agent='test') as downloader:
                tabs = configuration['tabs']
                sheetname = list(tabs.values())[0]
                noout = NoOutput(tabs)
                excelout = ExcelOutput(join(tempdir, 'test_output.xlsx'), tabs, tabs)
                gsheet_auth = getenv('GSHEET_AUTH')
                if not gsheet_auth:
                    raise ValueError('No gsheet authorisation supplied!')
                googleout = GoogleSheets(configuration, gsheet_auth, None, tabs, tabs)
                jsonout = JsonOutput(configuration, tabs)
                output = [list(hxltags.keys()), list(hxltags.values()), ['AFG', 'Afghanistan', 38041754]]

                # won't do anything as wrong tab name
                excelout.update_tab('lala', output, hxltags=hxltags)
                googleout.update_tab('lala', output, hxltags=hxltags)
                jsonout.update_tab('lala', output, hxltags=hxltags)

                noout.update_tab('national', output, hxltags=hxltags)
                excelout.update_tab('national', output, hxltags=hxltags)
                googleout.update_tab('national', output, hxltags=hxltags)
                jsonout.update_tab('national', output, hxltags=hxltags)
                noout.add_additional_json(downloader, today=parse_date('2020-10-01'))
                jsonout.add_additional_json(downloader, today=parse_date('2020-10-01'))
                noout.save()
                excelout.save()
                filepaths = jsonout.save(tempdir, countries_to_save=['AFG'])
                excelsheet = excelout.workbook.get_sheet_by_name(sheetname)

                def get_list_from_cells(cells):
                    result = [list(), list(), list()]
                    for i, row in enumerate(excelsheet[cells]):
                        for column in row:
                            result[i].append(column.value)
                    return result

                assert get_list_from_cells('A1:C3') == output
                spreadsheet = googleout.gc.open_by_url(configuration['googlesheets']['test'])
                googletab = spreadsheet.worksheet_by_title(sheetname)
                result = googletab.get_values(start=(1, 1), end=(3, 3), returnas='matrix')
                result[2][2] = int(result[2][2])
                assert result == output
                assert filecmp.cmp(filepaths[0], join(fixtures, 'test_scraper_all.json'))
                assert filecmp.cmp(filepaths[1], join(fixtures, 'test_scraper_population.json'))
                assert filecmp.cmp(filepaths[2], join(fixtures, 'test_scraper_population.json'))
                assert filecmp.cmp(filepaths[3], join(fixtures, 'test_scraper_other.json'))

                jsonout.json = dict()
                df = pandas.DataFrame(output[2:], columns=output[0])
                noout.update_tab('national', df, hxltags=hxltags)
                excelout.update_tab('national', df, hxltags=hxltags)
                googleout.update_tab('national', df, hxltags=hxltags)
                jsonout.update_tab('national', df, hxltags=hxltags)
                jsonout.add_additional_json(downloader, today=parse_date('2020-10-01'))
                filepaths = jsonout.save(tempdir, countries_to_save=['AFG'])
                assert get_list_from_cells('A1:C3') == output
                result = googletab.get_values(start=(1, 1), end=(3, 3), returnas='matrix')
                result[2][2] = int(result[2][2])
                assert result == output
                assert filecmp.cmp(filepaths[0], join(fixtures, 'test_scraper_all.json'))
                assert filecmp.cmp(filepaths[1], join(fixtures, 'test_scraper_population.json'))
                assert filecmp.cmp(filepaths[2], join(fixtures, 'test_scraper_population.json'))
                assert filecmp.cmp(filepaths[3], join(fixtures, 'test_scraper_other.json'))

                df = pandas.DataFrame(output[1:], columns=output[0])
                googleout.update_tab('national', df, limit=2)
                result = googletab.get_values(start=(1, 1), end=(3, 3), returnas='matrix')
                result[2][2] = int(result[2][2])
Code example #14
    def __init__(self,
                 countryiso3s,
                 adminone,
                 level,
                 today,
                 datasetinfo,
                 headers,
                 subsets,
                 maxdateonly=True):
        # type: (List[str], AdminOne, str, datetime, Dict, List[str], List[Dict], bool) -> None

        def get_level(lvl):
            if isinstance(lvl, str):
                if lvl == 'global':
                    return None
                elif lvl == 'national':
                    return 0
                else:
                    return 1
            return lvl

        self.level = get_level(level)
        self.today = today
        self.sort = datasetinfo.get('sort')
        self.datecol = datasetinfo.get('date_col')
        self.datetype = datasetinfo.get('date_type')
        if self.datetype:
            if self.datetype == 'date':
                date = parse_date('1900-01-01')
            else:
                date = 0
        else:
            date = 0
        self.maxdate = date
        datelevel = datasetinfo.get('date_level')
        if datelevel is None:
            self.datelevel = self.level
        else:
            self.datelevel = get_level(datelevel)
        date_condition = datasetinfo.get('date_condition')
        if date_condition is not None:
            for col in datasetinfo['input_cols']:
                date_condition = date_condition.replace(col, f"row['{col}']")
        self.date_condition = date_condition
        self.single_maxdate = datasetinfo.get('single_maxdate', False)
        self.ignore_future_date = datasetinfo.get('ignore_future_date', True)
        self.adminone = adminone
        self.admcols = datasetinfo.get('adm_cols', list())
        self.admexact = datasetinfo.get('adm_exact', False)
        self.subsets = subsets
        adms = datasetinfo.get('adm_vals')
        if adms is None:
            self.adms = [countryiso3s, self.adminone.pcodes]
        else:
            if self.datelevel == 1:
                self.adms = adms
            else:
                self.adms = [adms, self.adminone.pcodes]
        if self.datelevel is None:
            self.maxdates = {i: date for i, _ in enumerate(subsets)}
        else:
            if self.datelevel > len(self.admcols):
                raise ValueError(
                    'No admin columns specified for required level!')
            self.maxdates = {
                i: {adm: date
                    for adm in self.adms[self.datelevel]}
                for i, _ in enumerate(subsets)
            }

        self.maxdateonly = maxdateonly
        self.flatteninfo = datasetinfo.get('flatten')
        self.headers = headers
        self.filters = dict()
        self.read_external_filter(datasetinfo)
Code example #15
    def parse(self, row, scrapername=None):
        # type: (Dict, str) -> Tuple[Optional[str], Optional[List[bool]]]
        """Parse row checking for valid admin information and if the row should be filtered out in each subset given
        its definition.

        Args:
            row (Dict): Row to parse

        Returns:
            Tuple[Optional[str], Optional[List[bool]]]: (admin name, should process subset list) or (None, None)
        """
        if self.filtered(row):
            return None, None

        adms = [None for _ in range(len(self.admcols))]

        def get_adm(admcol, i):
            template_string, match_string = match_template(admcol)
            if template_string:
                admcol = self.headers[int(match_string)]
            adm = row[admcol]
            if not adm:
                return False
            adm = adm.strip()
            adms[i] = adm
            if adm in self.adms[i]:
                return True
            exact = False
            if self.admexact:
                adms[i] = None
            else:
                if i == 0:
                    adms[i], exact = Country.get_iso3_country_code_fuzzy(adm)
                elif i == 1:
                    adms[i], exact = self.adminone.get_pcode(
                        adms[0], adm, scrapername)
                if adms[i] not in self.adms[i]:
                    adms[i] = None
            return exact

        for i, admcol in enumerate(self.admcols):
            if admcol is None:
                continue
            if isinstance(admcol, str):
                admcol = [admcol]
            for admcl in admcol:
                exact = get_adm(admcl, i)
                if adms[i] and exact:
                    break
            if not adms[i]:
                return None, None

        should_process_subset = list()
        for subset in self.subsets:
            filter = subset['filter']
            process = True
            if filter:
                filters = filter.split('|')
                for filterstr in filters:
                    filter = filterstr.split('=')
                    if row[filter[0]] != filter[1]:
                        process = False
                        break
            should_process_subset.append(process)

        if self.datecol:
            if isinstance(self.datecol, list):
                dates = [str(row[x]) for x in self.datecol]
                date = ''.join(dates)
            else:
                date = row[self.datecol]
            if self.datetype == 'date':
                if not isinstance(date, datetime):
                    date = parse_date(date)
                date = date.replace(tzinfo=None)
                if date > self.today and self.ignore_future_date:
                    return None, None
            elif self.datetype == 'year':
                date = int(date)
                if date > self.today.year and self.ignore_future_date:
                    return None, None
            else:
                date = int(date)
            if self.date_condition:
                if eval(self.date_condition) is False:
                    return None, None
            for i, process in enumerate(should_process_subset):
                if not process:
                    continue
                if date < self.maxdate:
                    if self.single_maxdate:
                        should_process_subset[i] = False
                else:
                    self.maxdate = date
                if self.datelevel is None:
                    if self.maxdateonly:
                        if date < self.maxdates[i]:
                            should_process_subset[i] = False
                        else:
                            self.maxdates[i] = date
                    else:
                        self.maxdates[i] = date
                else:
                    if self.maxdateonly:
                        if date < self.maxdates[i][adms[self.datelevel]]:
                            should_process_subset[i] = False
                        else:
                            self.maxdates[i][adms[self.datelevel]] = date
                    else:
                        self.maxdates[i][adms[self.datelevel]] = date
        if self.level is None:
            return 'global', should_process_subset
        return adms[self.level], should_process_subset
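
When date_type is 'date', parse() converts strings to naive datetimes and can drop rows dated after today. A self-contained sketch of that path with hypothetical values; tzinfo is stripped so the comparison never mixes aware and naive datetimes, and parse_date is assumed to come from hdx.utilities.dateparse:

from datetime import datetime
from hdx.utilities.dateparse import parse_date

today = parse_date('2021-05-03').replace(tzinfo=None)
ignore_future_date = True
date = '2021-06-30'
if not isinstance(date, datetime):
    date = parse_date(date)
date = date.replace(tzinfo=None)
if date > today and ignore_future_date:
    pass  # future-dated row: the original returns None, None here
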
Code example #16
 def test_get_tabular(self, configuration):
     with Download(user_agent='test') as downloader:
         today = parse_date('2020-10-01')
         adminone = AdminOne(configuration)
         population_lookup = dict()
         level = 'national'
         scraper_configuration = configuration[f'scraper_{level}']
         results = run_scrapers(scraper_configuration, ['AFG'],
                                adminone,
                                level,
                                downloader,
                                today=today,
                                scrapers=['population'],
                                population_lookup=population_lookup)
         assert results['headers'] == [['Population'], ['#population']]
         assert results['values'] == [{'AFG': 38041754}]
         assert results['sources'] == [
             ('#population', '2020-10-01', 'World Bank',
              'https://data.humdata.org/organization/world-bank-group')
         ]
         results = run_scrapers(scraper_configuration, ['AFG'],
                                adminone,
                                level,
                                downloader,
                                today=today,
                                scrapers=['who'],
                                population_lookup=population_lookup)
         assert results['headers'] == [[
             'CasesPer100000', 'DeathsPer100000', 'Cases2Per100000',
             'Deaths2Per100000'
         ],
                                       [
                                           '#affected+infected+per100000',
                                           '#affected+killed+per100000',
                                           '#affected+infected+2+per100000',
                                           '#affected+killed+2+per100000'
                                       ]]
         assert results['values'] == [{
             'AFG': '96.99'
         }, {
             'AFG': '3.41'
         }, {
             'AFG': '96.99'
         }, {
             'AFG': '3.41'
         }]
         assert results['sources'] == [
             ('#affected+infected+per100000', '2020-08-06', 'WHO',
              'tests/fixtures/WHO-COVID-19-global-data.csv'),
             ('#affected+killed+per100000', '2020-08-06', 'WHO',
              'tests/fixtures/WHO-COVID-19-global-data.csv'),
             ('#affected+infected+2+per100000', '2020-08-06', 'WHO',
              'tests/fixtures/WHO-COVID-19-global-data.csv'),
             ('#affected+killed+2+per100000', '2020-08-06', 'WHO',
              'tests/fixtures/WHO-COVID-19-global-data.csv')
         ]
         results = run_scrapers(scraper_configuration, ['AFG'],
                                adminone,
                                level,
                                downloader,
                                today=today,
                                scrapers=['access'],
                                population_lookup=population_lookup)
         assert results['headers'] == [
             [
                 '% of visas pending or denied',
                 '% of travel authorizations or movements denied',
                 'Number of incidents reported in previous year',
                 'Number of incidents reported since start of year',
                 'Number of incidents reported since start of previous year',
                 '% of CERF projects affected by insecurity and inaccessibility',
                 '% of CBPF projects affected by insecurity and inaccessibility',
                 'Campaign Vaccine', 'Campaign Vaccine Status',
                 'Number of learners enrolled from pre-primary to tertiary education'
             ],
             [
                 '#access+visas+pct', '#access+travel+pct',
                 '#event+year+previous+num', '#event+year+todate+num',
                 '#event+year+previous+todate+num',
                 '#activity+cerf+project+insecurity+pct',
                 '#activity+cbpf+project+insecurity+pct', '#service+name',
                 '#status+name', '#population+education'
             ]
         ]
         assert results['values'] == [{
             'AFG': 0.2
         }, {
             'AFG': 'N/A'
         }, {
             'AFG': '20'
         }, {
             'AFG': '2'
         }, {
             'AFG': '22'
         }, {
             'AFG': 0.5710000000000001
         }, {
             'AFG': 0.04
         }, {
             'AFG': 'bivalent Oral Poliovirus'
         }, {
             'AFG': 'Postponed'
         }, {
             'AFG': 9979405
         }]
         assert results['sources'] == [
             ('#access+visas+pct', '2020-10-01', 'OCHA',
              'https://docs.google.com/spreadsheets/d/e/2PACX-1vRSzJzuyVt9i_mkRQ2HbxrUl2Lx2VIhkTHQM-laE8NyhQTy70zQTCuFS3PXbhZGAt1l2bkoA4_dAoAP/pub?gid=1565063847&single=true&output=csv'
              ),
             ('#access+travel+pct', '2020-10-01', 'OCHA',
              'https://docs.google.com/spreadsheets/d/e/2PACX-1vRSzJzuyVt9i_mkRQ2HbxrUl2Lx2VIhkTHQM-laE8NyhQTy70zQTCuFS3PXbhZGAt1l2bkoA4_dAoAP/pub?gid=1565063847&single=true&output=csv'
              ),
             ('#event+year+previous+num', '2020-10-01',
              'Aid Workers Database',
              'https://data.humdata.org/dataset/security-incidents-on-aid-workers'
              ),
             ('#event+year+todate+num', '2020-10-01',
              'Aid Workers Database',
              'https://data.humdata.org/dataset/security-incidents-on-aid-workers'
              ),
             ('#event+year+previous+todate+num', '2020-10-01',
              'Aid Workers Database',
              'https://data.humdata.org/dataset/security-incidents-on-aid-workers'
              ),
             ('#activity+cerf+project+insecurity+pct', '2020-10-01',
              'UNCERF',
              'https://docs.google.com/spreadsheets/d/e/2PACX-1vRSzJzuyVt9i_mkRQ2HbxrUl2Lx2VIhkTHQM-laE8NyhQTy70zQTCuFS3PXbhZGAt1l2bkoA4_dAoAP/pub?gid=1565063847&single=true&output=csv'
              ),
             ('#activity+cbpf+project+insecurity+pct', '2020-10-01',
              'UNCERF',
              'https://docs.google.com/spreadsheets/d/e/2PACX-1vRSzJzuyVt9i_mkRQ2HbxrUl2Lx2VIhkTHQM-laE8NyhQTy70zQTCuFS3PXbhZGAt1l2bkoA4_dAoAP/pub?gid=1565063847&single=true&output=csv'
              ),
             ('#service+name', '2020-10-01', 'Multiple sources',
              'https://docs.google.com/spreadsheets/d/e/2PACX-1vRSzJzuyVt9i_mkRQ2HbxrUl2Lx2VIhkTHQM-laE8NyhQTy70zQTCuFS3PXbhZGAt1l2bkoA4_dAoAP/pub?gid=1565063847&single=true&output=csv'
              ),
             ('#status+name', '2020-10-01', 'Multiple sources',
              'https://docs.google.com/spreadsheets/d/e/2PACX-1vRSzJzuyVt9i_mkRQ2HbxrUl2Lx2VIhkTHQM-laE8NyhQTy70zQTCuFS3PXbhZGAt1l2bkoA4_dAoAP/pub?gid=1565063847&single=true&output=csv'
              ),
             ('#population+education', '2020-10-01', 'UNESCO',
              'https://docs.google.com/spreadsheets/d/e/2PACX-1vRSzJzuyVt9i_mkRQ2HbxrUl2Lx2VIhkTHQM-laE8NyhQTy70zQTCuFS3PXbhZGAt1l2bkoA4_dAoAP/pub?gid=1565063847&single=true&output=csv'
              )
         ]
         results = run_scrapers(scraper_configuration, ['AFG'],
                                adminone,
                                level,
                                downloader,
                                today=today,
                                scrapers=['sadd'],
                                population_lookup=population_lookup)
         assert results['headers'] == [[
             'Cases (% male)', 'Cases (% female)', 'Deaths (% male)',
             'Deaths (% female)'
         ],
                                       [
                                           '#affected+infected+m+pct',
                                           '#affected+f+infected+pct',
                                           '#affected+killed+m+pct',
                                           '#affected+f+killed+pct'
                                       ]]
         assert results['values'] == [{
             'AFG': '0.7044'
         }, {
             'AFG': '0.2956'
         }, {
             'AFG': '0.7498'
         }, {
             'AFG': '0.2502'
         }]
         assert results['sources'] == [
             ('#affected+infected+m+pct', '2020-08-07', 'SADD',
              'tests/fixtures/covid-19-sex-disaggregated-data.csv'),
             ('#affected+f+infected+pct', '2020-08-07', 'SADD',
              'tests/fixtures/covid-19-sex-disaggregated-data.csv'),
             ('#affected+killed+m+pct', '2020-08-07', 'SADD',
              'tests/fixtures/covid-19-sex-disaggregated-data.csv'),
             ('#affected+f+killed+pct', '2020-08-07', 'SADD',
              'tests/fixtures/covid-19-sex-disaggregated-data.csv')
         ]
         results = run_scrapers(scraper_configuration, ['AFG', 'PHL'],
                                adminone,
                                level,
                                downloader,
                                today=today,
                                scrapers=['ourworldindata'],
                                population_lookup=population_lookup)
         assert results['headers'] == [[
             'TotalDosesAdministered'
         ], ['#capacity+doses+administered+total']]
         assert results['values'] == [dict()]
         assert results['sources'] == [
             ('#capacity+doses+administered+total', '2020-10-01',
              'Our World in Data',
              'tests/fixtures/ourworldindata_vaccinedoses.csv')
         ]
         today = parse_date('2021-05-03')
         results = run_scrapers(scraper_configuration, ['AFG', 'PHL'],
                                adminone,
                                level,
                                downloader,
                                today=today,
                                scrapers=['ourworldindata'],
                                population_lookup=population_lookup)
         assert results['headers'] == [[
             'TotalDosesAdministered'
         ], ['#capacity+doses+administered+total']]
         assert results['values'] == [{'AFG': '240000'}]
         # NB: Source data will have been written into the in-memory config by the immediately previous run of the
         # ourworldindata scraper and is hence 2020-10-01 instead of 2021-05-03
         assert results['sources'] == [
             ('#capacity+doses+administered+total', '2020-10-01',
              'Our World in Data',
              'tests/fixtures/ourworldindata_vaccinedoses.csv')
         ]
         today = parse_date('2020-10-01')
         level = 'subnational'
         scraper_configuration = configuration[f'scraper_{level}']
         results = run_scrapers(scraper_configuration, ['AFG'],
                                adminone,
                                level,
                                downloader,
                                today=today,
                                scrapers=['gam'],
                                population_lookup=population_lookup)
         assert results['headers'] == [[
             'Malnutrition Estimate'
         ], ['#severity+malnutrition+num+subnational']]
         assert results['values'] == [{
             'AF17': 3.371688,
             'AF31': 3.519166,
             'AF09': 1.524646,
             'AF21': 1.319626,
             'AF10': 1.40426,
             'AF24': 1.043487,
             'AF33': 2.745447,
             'AF29': 2.478977,
             'AF11': 1.022871,
             'AF23': 1.340286,
             'AF30': 1.677612,
             'AF32': 1.687488,
             'AF28': 0.6210205,
             'AF01': 1.282291,
             'AF27': 1.378641,
             'AF02': 3.552082,
             'AF14': 0.7653555,
             'AF15': 0.953823,
             'AF19': 1.684882,
             'AF07': 2.090165,
             'AF05': 0.9474334,
             'AF06': 2.162038,
             'AF34': 1.6455,
             'AF16': 1.927783,
             'AF12': 4.028857,
             'AF13': 9.150105,
             'AF08': 1.64338,
             'AF03': 2.742952,
             'AF20': 1.382376,
             'AF22': 1.523334,
             'AF18': 0.9578965,
             'AF25': 0.580423,
             'AF04': 0.501081,
             'AF26': 4.572629
         }]
         assert results['sources'] == [(
             '#severity+malnutrition+num+subnational', '2020-10-01',
             'UNICEF',
             'tests/fixtures/unicef_who_wb_global_expanded_databases_severe_wasting.xlsx'
         )]
         scraper_configuration = configuration['other']
         results = run_scrapers(scraper_configuration, ['AFG'],
                                adminone,
                                level,
                                downloader,
                                today=today,
                                scrapers=['gam'],
                                population_lookup=population_lookup)
         assert results['headers'] == [[
             'Malnutrition Estimate'
         ], ['#severity+malnutrition+num+subnational']]
         assert results['values'] == [{'AF09': 1.524646, 'AF24': 1.043487}]
         assert results['sources'] == [(
             '#severity+malnutrition+num+subnational', '2020-10-01',
             'UNICEF',
             'tests/fixtures/unicef_who_wb_global_expanded_databases_severe_wasting.xlsx'
         )]
         level = 'global'
         scraper_configuration = configuration[f'scraper_{level}']
         results = run_scrapers(scraper_configuration,
                                configuration['HRPs'],
                                adminone,
                                level,
                                downloader,
                                today=today,
                                scrapers=['covax'],
                                population_lookup=population_lookup)
         assert results['headers'] == [
             [
                 'Covax Interim Forecast Doses', 'Covax Delivered Doses',
                 'Other Delivered Doses', 'Total Delivered Doses',
                 'Covax Pfizer-BioNTech Doses',
                 'Covax Astrazeneca-SII Doses',
                 'Covax Astrazeneca-SKBio Doses'
             ],
             [
                 '#capacity+doses+forecast+covax',
                 '#capacity+doses+delivered+covax',
                 '#capacity+doses+delivered+others',
                 '#capacity+doses+delivered+total',
                 '#capacity+doses+covax+pfizerbiontech',
                 '#capacity+doses+covax+astrazenecasii',
                 '#capacity+doses+covax+astrazenecaskbio'
             ]
         ]
         assert results['values'] == [{
             'global': '73248240'
         }, {
             'global': '12608040'
         }, {
             'global': '23728358'
         }, {
             'global': '36336398'
         }, {
             'global': '271440'
         }, {
             'global': '67116000'
         }, {
             'global': '5860800'
         }]
         assert results['sources'] == [
             ('#capacity+doses+forecast+covax', '2020-08-07', 'covax',
              'tests/fixtures/COVID-19 Vaccine Doses in HRP Countries - Data HXL.csv'
              ),
             ('#capacity+doses+delivered+covax', '2020-08-07', 'covax',
              'tests/fixtures/COVID-19 Vaccine Doses in HRP Countries - Data HXL.csv'
              ),
             ('#capacity+doses+delivered+others', '2020-08-07', 'covax',
              'tests/fixtures/COVID-19 Vaccine Doses in HRP Countries - Data HXL.csv'
              ),
             ('#capacity+doses+delivered+total', '2020-08-07', 'covax',
              'tests/fixtures/COVID-19 Vaccine Doses in HRP Countries - Data HXL.csv'
              ),
             ('#capacity+doses+covax+pfizerbiontech', '2020-08-07', 'covax',
              'tests/fixtures/COVID-19 Vaccine Doses in HRP Countries - Data HXL.csv'
              ),
             ('#capacity+doses+covax+astrazenecasii', '2020-08-07', 'covax',
              'tests/fixtures/COVID-19 Vaccine Doses in HRP Countries - Data HXL.csv'
              ),
             ('#capacity+doses+covax+astrazenecaskbio', '2020-08-07',
              'covax',
              'tests/fixtures/COVID-19 Vaccine Doses in HRP Countries - Data HXL.csv'
              )
         ]
         results = run_scrapers(scraper_configuration,
                                configuration['HRPs'],
                                adminone,
                                level,
                                downloader,
                                today=today,
                                scrapers=['cerf_global'],
                                population_lookup=population_lookup)
         assert results['headers'] == [
             [
                 'CBPFFunding', 'CBPFFundingGMEmpty', 'CBPFFundingGM0',
                 'CBPFFundingGM1', 'CBPFFundingGM2', 'CBPFFundingGM3',
                 'CBPFFundingGM4', 'CERFFunding', 'CERFFundingGMEmpty',
                 'CERFFundingGM0', 'CERFFundingGM1', 'CERFFundingGM2',
                 'CERFFundingGM3', 'CERFFundingGM4'
             ],
             [
                 '#value+cbpf+funding+total+usd',
                 '#value+cbpf+funding+gmempty+total+usd',
                 '#value+cbpf+funding+gm0+total+usd',
                 '#value+cbpf+funding+gm1+total+usd',
                 '#value+cbpf+funding+gm2+total+usd',
                 '#value+cbpf+funding+gm3+total+usd',
                 '#value+cbpf+funding+gm4+total+usd',
                 '#value+cerf+funding+total+usd',
                 '#value+cerf+funding+gmempty+total+usd',
                 '#value+cerf+funding+gm0+total+usd',
                 '#value+cerf+funding+gm1+total+usd',
                 '#value+cerf+funding+gm2+total+usd',
                 '#value+cerf+funding+gm3+total+usd',
                 '#value+cerf+funding+gm4+total+usd'
             ]
         ]
         assert results['values'] == [{
             'global': 906790749.5500005
         }, {
             'global': 829856355.4100008
         }, {
             'global': 37432868.04999999
         }, {
             'global': 39501526.08999999
         }, {}, {}, {}, {
             'global': 848145238.0
         }, {}, {
             'global': 50042305.0
         }, {
             'global': 75349572.0
         }, {
             'global': 224560378.0
         }, {
             'global': 349338181.0
         }, {
             'global': 147855321.0
         }]
         assert results['sources'] == [
             ('#value+cbpf+funding+total+usd', '2020-10-01',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cbpf+funding+gmempty+total+usd', '2020-10-01',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cbpf+funding+gm0+total+usd', '2020-10-01',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cbpf+funding+gm1+total+usd', '2020-10-01',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cbpf+funding+gm2+total+usd', '2020-10-01',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cbpf+funding+gm3+total+usd', '2020-10-01',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cbpf+funding+gm4+total+usd', '2020-10-01',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cerf+funding+total+usd', '2020-10-01',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cerf+funding+gmempty+total+usd', '2020-10-01',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cerf+funding+gm0+total+usd', '2020-10-01',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cerf+funding+gm1+total+usd', '2020-10-01',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cerf+funding+gm2+total+usd', '2020-10-01',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cerf+funding+gm3+total+usd', '2020-10-01',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cerf+funding+gm4+total+usd', '2020-10-01',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations')
         ]
         results = run_scrapers(scraper_configuration,
                                configuration['HRPs'],
                                adminone,
                                level,
                                downloader,
                                today=today,
                                scrapers=['ourworldindata'],
                                population_lookup=population_lookup)
         assert results['headers'] == [[
             'TotalDosesAdministered'
         ], ['#capacity+doses+administered+total']]
         assert results['values'] == [dict()]
         assert results['sources'] == [
             ('#capacity+doses+administered+total', '2020-10-01',
              'Our World in Data',
              'tests/fixtures/ourworldindata_vaccinedoses.csv')
         ]
         today = parse_date('2021-05-03')
         results = run_scrapers(scraper_configuration,
                                configuration['HRPs'],
                                adminone,
                                level,
                                downloader,
                                today=today,
                                scrapers=['cerf_global'],
                                population_lookup=population_lookup)
         assert results['headers'] == [
             [
                 'CBPFFunding', 'CBPFFundingGMEmpty', 'CBPFFundingGM0',
                 'CBPFFundingGM1', 'CBPFFundingGM2', 'CBPFFundingGM3',
                 'CBPFFundingGM4', 'CERFFunding', 'CERFFundingGMEmpty',
                 'CERFFundingGM0', 'CERFFundingGM1', 'CERFFundingGM2',
                 'CERFFundingGM3', 'CERFFundingGM4'
             ],
             [
                 '#value+cbpf+funding+total+usd',
                 '#value+cbpf+funding+gmempty+total+usd',
                 '#value+cbpf+funding+gm0+total+usd',
                 '#value+cbpf+funding+gm1+total+usd',
                 '#value+cbpf+funding+gm2+total+usd',
                 '#value+cbpf+funding+gm3+total+usd',
                 '#value+cbpf+funding+gm4+total+usd',
                 '#value+cerf+funding+total+usd',
                 '#value+cerf+funding+gmempty+total+usd',
                 '#value+cerf+funding+gm0+total+usd',
                 '#value+cerf+funding+gm1+total+usd',
                 '#value+cerf+funding+gm2+total+usd',
                 '#value+cerf+funding+gm3+total+usd',
                 '#value+cerf+funding+gm4+total+usd'
             ]
         ]
         assert results['values'] == [{
             'global': 7811774.670000001
         }, {
             'global': 7811774.670000001
         }, {}, {}, {}, {}, {}, {
             'global': 89298919.0
         }, {
             'global': 6747034.0
         }, {}, {
             'global': 2549855.0
         }, {
             'global': 10552572.0
         }, {
             'global': 26098816.0
         }, {
             'global': 43350642.0
         }]
         assert results['sources'] == [
             ('#value+cbpf+funding+total+usd', '2021-05-03',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cbpf+funding+gmempty+total+usd', '2021-05-03',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cbpf+funding+gm0+total+usd', '2021-05-03',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cbpf+funding+gm1+total+usd', '2021-05-03',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cbpf+funding+gm2+total+usd', '2021-05-03',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cbpf+funding+gm3+total+usd', '2021-05-03',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cbpf+funding+gm4+total+usd', '2021-05-03',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cerf+funding+total+usd', '2021-05-03',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cerf+funding+gmempty+total+usd', '2021-05-03',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cerf+funding+gm0+total+usd', '2021-05-03',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cerf+funding+gm1+total+usd', '2021-05-03',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cerf+funding+gm2+total+usd', '2021-05-03',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cerf+funding+gm3+total+usd', '2021-05-03',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
             ('#value+cerf+funding+gm4+total+usd', '2021-05-03',
              'CERF and CBPF',
              'https://data.humdata.org/dataset/cerf-covid-19-allocations')
         ]
         results = run_scrapers(scraper_configuration,
                                configuration['HRPs'],
                                adminone,
                                level,
                                downloader,
                                today=today,
                                scrapers=['ourworldindata'],
                                population_lookup=population_lookup)
         assert results['headers'] == [[
             'TotalDosesAdministered'
         ], ['#capacity+doses+administered+total']]
         assert results['values'] == [{'global': '13413871'}]
         assert results['sources'] == [
             ('#capacity+doses+administered+total', '2020-10-01',
              'Our World in Data',
              'tests/fixtures/ourworldindata_vaccinedoses.csv')
         ]
         scraper_configuration = configuration['other']
         results = run_scrapers(scraper_configuration,
                                configuration['HRPs'],
                                adminone,
                                level,
                                downloader,
                                today=today,
                                scrapers=['ourworldindata'],
                                population_lookup=population_lookup)
         assert results['headers'] == [[
             'TotalDosesAdministered'
         ], ['#capacity+doses+administered+total']]
         assert results['values'] == [{'global': '1175451507'}]
         assert results['sources'] == [
             ('#capacity+doses+administered+total', '2021-05-03',
              'Our World in Data',
              'tests/fixtures/ourworldindata_vaccinedoses.csv')
         ]
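
The 'ourworldindata' mini scraper exercised above is configuration-driven. A rough sketch of the kind of entry that could produce this output, using the datasetinfo keys read by the scraper code later on this page (the real test configuration is not shown, so the column names are assumptions):

    scraper_configuration = {
        'ourworldindata': {
            'source': 'Our World in Data',
            'source_url': 'tests/fixtures/ourworldindata_vaccinedoses.csv',
            'format': 'csv',                       # assumed download format key
            'use_date_from_date_col': True,
            'date_col': 'date',                    # assumed date column in the fixture
            'date_type': 'date',
            'input_cols': ['total_vaccinations'],  # assumed value column in the fixture
            'output_cols': ['TotalDosesAdministered'],
            'output_hxltags': ['#capacity+doses+administered+total'],
        }
    }
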
Code example #17
0
    def update(
        self,
        sheetname: str,
        rows: List[Dict],
        dutyofficer_name: Optional[str] = None,
    ) -> None:
        """Update output Google spreadsheet (which must have been set up with
        setup_gsheet). The duty officer which is usually taken from the HDX Data
        Partnerships Team Duty Roster spreadsheet can be overridden by supplying
        dutyofficer_name.

        Args:
            sheetname (str): Name of tab in Google spreadsheet to output to
            rows (List[Dict]): Rows to add to Google spreadsheet
            dutyofficer_name (Optional[str]): Name of duty officer. Defaults to None.

        Returns:
            None
        """

        if self.issues_spreadsheet is None or (self.dutyofficer is None
                                               and dutyofficer_name is None):
            logger.warning("Cannot update Google spreadsheet!")
            return
        logger.info("Updating Google spreadsheet.")
        sheet = self.issues_spreadsheet.worksheet(sheetname)
        gsheet_rows = sheet.get_values()
        keys = gsheet_rows[0]
        url_ind = keys.index("URL")
        if "Update Frequency" in keys:
            update_frequency_ind = keys.index("Update Frequency")
        else:
            update_frequency_ind = None
        dateadded_ind = keys.index("Date Added")
        dateoccurred_ind = keys.index("Date Last Occurred")
        no_times_ind = keys.index("No. Times")
        assigned_ind = keys.index("Assigned")
        status_ind = keys.index("Status")
        headers = gsheet_rows[0]
        gsheet_rows = [row for row in gsheet_rows[1:] if row[url_ind]]
        urls = [x[url_ind] for x in gsheet_rows]
        if update_frequency_ind is not None:
            for gsheet_row in gsheet_rows:
                updatefreq = gsheet_row[update_frequency_ind]
                gsheet_row[update_frequency_ind] = int(
                    Dataset.transform_update_frequency(updatefreq))
        updated_notimes = set()
        now = self.now.isoformat()
        for row in rows:
            url = row["URL"]
            new_row = [row.get(key, "") for key in keys]
            new_row[dateoccurred_ind] = now
            try:
                rowno = urls.index(url)
                current_row = gsheet_rows[rowno]
                new_row[dateadded_ind] = current_row[dateadded_ind]
                no_times = current_row[no_times_ind]
                new_row[no_times_ind] = int(no_times)
                if url not in updated_notimes:
                    updated_notimes.add(url)
                    new_row[no_times_ind] += 1
                new_row[assigned_ind] = current_row[assigned_ind]
                new_row[status_ind] = current_row[status_ind]
                gsheet_rows[rowno] = new_row
            except ValueError:
                new_row[dateadded_ind] = now
                new_row[no_times_ind] = 1
                if dutyofficer_name is not None:
                    new_row[assigned_ind] = dutyofficer_name
                else:
                    new_row[assigned_ind] = self.dutyofficer["name"]
                gsheet_rows.append(new_row)
                urls.append(url)
                updated_notimes.add(url)
        if update_frequency_ind is None:
            gsheet_rows = sorted(gsheet_rows,
                                 key=lambda x: x[dateoccurred_ind],
                                 reverse=True)
        else:
            headers.append("sort")
            sort_ind = headers.index("sort")
            for gsheet_row in gsheet_rows:
                dateoccurred = gsheet_row[dateoccurred_ind]
                if dateoccurred == now:
                    sort_val = 0
                else:
                    nodays = self.now - parse_date(dateoccurred)
                    update_freq = gsheet_row[update_frequency_ind]
                    if update_freq == -1:
                        update_freq = 1000
                    elif update_freq == -2:
                        update_freq = 500
                    elif update_freq == 0:
                        update_freq = 0.5
                    sort_val = nodays.days / update_freq
                gsheet_row.append(sort_val)
            gsheet_rows = sorted(
                gsheet_rows,
                key=lambda x: (-x[sort_ind], x[dateoccurred_ind]),
                reverse=True,
            )
        no_rows = len(gsheet_rows)
        no_rows_to_remove = no_rows - self.row_limit
        if no_rows_to_remove > 0:
            gsheet_rows = gsheet_rows[:-no_rows_to_remove]

        if update_frequency_ind is not None:
            for gsheet_row in gsheet_rows:
                update_freq = gsheet_row[update_frequency_ind]
                gsheet_row[
                    update_frequency_ind] = HDXHelper.get_update_frequency(
                        update_freq)
                del gsheet_row[sort_ind]
            del headers[sort_ind]
        sheet.clear()
        sheet.update("A1", [headers] + gsheet_rows)
Code example #18
0
def _get_tabular(level,
                 name,
                 datasetinfo,
                 headers,
                 iterator,
                 population_lookup,
                 retheaders=[list(), list()],
                 retval=list(),
                 sources=list()):
    # Note: the mutable default arguments above persist between calls, so
    # output accumulates unless fresh lists are passed in.
    indicatorcols = datasetinfo.get('indicator_cols')
    if not indicatorcols:
        indicatorcols = [{
            'filter_col': datasetinfo.get('filter_col'),
            'val_cols': datasetinfo.get('val_cols', list()),
            'val_fns': datasetinfo.get('val_fns', dict()),
            'eval_cols': datasetinfo.get('eval_cols', list()),
            'keep_cols': datasetinfo.get('keep_cols', list()),
            'append_cols': datasetinfo.get('append_cols', list()),
            'total_cols': datasetinfo.get('total_cols'),
            'ignore_vals': datasetinfo.get('ignore_vals', list()),
            'columns': datasetinfo.get('columns', list()),
            'hxltags': datasetinfo.get('hxltags', list())
        }]
    use_hxl = datasetinfo.get('use_hxl', False)
    if use_hxl:
        hxlrow = next(iterator)
        while not hxlrow:
            hxlrow = next(iterator)
        exclude_tags = datasetinfo.get('exclude_tags', list())
        adm_cols = list()
        val_cols = list()
        columns = list()
        for header in headers:
            hxltag = hxlrow[header]
            if not hxltag or hxltag in exclude_tags:
                continue
            if '#country' in hxltag:
                if 'code' in hxltag:
                    if len(adm_cols) == 0:
                        adm_cols.append(hxltag)
                    else:
                        adm_cols[0] = hxltag
                continue
            if '#adm1' in hxltag:
                if 'code' in hxltag:
                    if len(adm_cols) == 0:
                        adm_cols.append(None)
                    if len(adm_cols) == 1:
                        adm_cols.append(hxltag)
                continue
            if hxltag == datasetinfo.get('date_col') and datasetinfo.get(
                    'include_date', False) is False:
                continue
            val_cols.append(hxltag)
            columns.append(header)
        datasetinfo['adm_cols'] = adm_cols
        for indicatorcol in indicatorcols:
            orig_val_cols = indicatorcol.get('val_cols', list())
            if not orig_val_cols:
                orig_val_cols.extend(val_cols)
            indicatorcol['val_cols'] = orig_val_cols
            orig_columns = indicatorcol.get('columns', list())
            if not orig_columns:
                orig_columns.extend(columns)
            indicatorcol['columns'] = orig_columns
            orig_hxltags = indicatorcol.get('hxltags', list())
            if not orig_hxltags:
                orig_hxltags.extend(val_cols)
            indicatorcol['hxltags'] = orig_hxltags
    else:
        hxlrow = None

    rowparser = RowParser(level, datasetinfo, headers, indicatorcols)
    valuedicts = dict()
    for indicatorcol in indicatorcols:
        for _ in indicatorcol['val_cols']:
            dict_of_lists_add(valuedicts, indicatorcol['filter_col'], dict())

    def add_row(row):
        adm, indicators_process = rowparser.do_set_value(row, name)
        if not adm:
            return
        for i, indicatorcol in enumerate(indicatorcols):
            if not indicators_process[i]:
                continue
            filtercol = indicatorcol['filter_col']
            total_cols = indicatorcol.get('total_cols')
            eval_cols = indicatorcol.get('eval_cols')
            append_cols = indicatorcol.get('append_cols', list())
            keep_cols = indicatorcol.get('keep_cols', list())
            for i, valcol in enumerate(indicatorcol['val_cols']):
                valuedict = valuedicts[filtercol][i]
                val = get_rowval(row, valcol)
                if total_cols or eval_cols:
                    dict_of_lists_add(valuedict, adm, val)
                else:
                    curval = valuedict.get(adm)
                    if valcol in append_cols:
                        if curval:
                            val = curval + val
                    elif valcol in keep_cols:
                        if curval:
                            val = curval
                    valuedict[adm] = val

    stop_row = datasetinfo.get('stop_row')
    for row in iterator:
        if not isinstance(row, dict):
            row = row.value
        if hxlrow:
            newrow = dict()
            for header in row:
                newrow[hxlrow[header]] = row[header]
            row = newrow
        if stop_row:
            if all(row[key] == value for key, value in stop_row.items()):
                break
        for newrow in rowparser.flatten(row):
            add_row(newrow)

    date = datasetinfo.get('date')
    use_date_from_date_col = datasetinfo.get('use_date_from_date_col', False)
    if date and not use_date_from_date_col:
        date = parse_date(date)
    else:
        date = rowparser.get_maxdate()
        if date == 0:
            raise ValueError('No date given in datasetinfo or as a column!')
        if rowparser.datetype == 'date':
            if not isinstance(date, datetime):
                date = parse_date(date)
        elif rowparser.datetype == 'int':
            date = get_datetime_from_timestamp(date)
        else:
            raise ValueError('No date type specified!')
    date = date.strftime('%Y-%m-%d')

    for indicatorcol in indicatorcols:
        retheaders[0].extend(indicatorcol['columns'])
        hxltags = indicatorcol['hxltags']
        retheaders[1].extend(hxltags)
        valdicts = valuedicts[indicatorcol['filter_col']]
        eval_cols = indicatorcol.get('eval_cols')
        keep_cols = indicatorcol.get('keep_cols', list())
        total_cols = indicatorcol.get('total_cols')
        ignore_vals = indicatorcol.get('ignore_vals', list())
        val_fns = indicatorcol.get('val_fns', dict())
        valcols = indicatorcol['val_cols']
        # Indices of list sorted by length
        sorted_len_indices = sorted(range(len(valcols)),
                                    key=lambda k: len(valcols[k]),
                                    reverse=True)
        if eval_cols:
            newvaldicts = [dict() for _ in eval_cols]

            def text_replacement(string, adm):
                string = string.replace('#population', '#pzbgvjh')
                hasvalues = False
                for j in sorted_len_indices:
                    valcol = valcols[j]
                    if valcol not in string:
                        continue
                    if valcol in keep_cols:
                        keep_col_index = 0
                    else:
                        keep_col_index = -1
                    val = valdicts[j][adm][keep_col_index]
                    if not val or val in ignore_vals:
                        val = 0
                    else:
                        val_fn = val_fns.get(valcol)
                        if val_fn:
                            val = eval(val_fn.replace(valcol, 'val'))
                        hasvalues = True
                    string = string.replace(valcol, str(val))
                string = string.replace('#pzbgvjh', '#population')
                return string, hasvalues

            for i, eval_col in enumerate(eval_cols):
                valdict0 = valdicts[0]
                for adm in valdict0:
                    hasvalues = True
                    matches = regex.search(brackets,
                                           eval_col,
                                           flags=regex.VERBOSE)
                    if matches:
                        for bracketed_str in matches.captures('rec'):
                            if any(bracketed_str in x for x in valcols):
                                continue
                            _, hasvalues_t = text_replacement(
                                bracketed_str, adm)
                            if not hasvalues_t:
                                hasvalues = False
                                break
                    if hasvalues:
                        formula, hasvalues_t = text_replacement(eval_col, adm)
                        if hasvalues_t:
                            formula = formula.replace(
                                '#population', 'population_lookup[adm]')
                            newvaldicts[i][adm] = eval(formula)
                        else:
                            newvaldicts[i][adm] = ''
                    else:
                        newvaldicts[i][adm] = ''
            retval.extend(newvaldicts)
        elif total_cols:
            for total_col in total_cols:
                formula = total_col['formula']
                mustbepopulated = total_col.get('mustbepopulated', False)
                newvaldicts = [dict() for _ in valdicts]
                valdict0 = valdicts[0]
                for adm in valdict0:
                    for i, val in enumerate(valdict0[adm]):
                        if not val or val in ignore_vals:
                            exists = False
                        else:
                            exists = True
                            for valdict in valdicts[1:]:
                                val = valdict[adm][i]
                                if not val or val in ignore_vals:
                                    exists = False
                                    break
                        if mustbepopulated and not exists:
                            continue
                        for j, valdict in enumerate(valdicts):
                            valcol = valcols[j]
                            val_fn = val_fns.get(valcol)
                            if not val_fn:
                                val_fn = valcol
                            newvaldicts[j][adm] = newvaldicts[j].get(
                                adm, 0.0
                            ) + eval(
                                val_fn.replace(
                                    valcol,
                                    'get_numeric_if_possible(valdict[adm][i])')
                            )
                formula = formula.replace('#population', '#pzbgvjh')
                for i in sorted_len_indices:
                    formula = formula.replace(valcols[i],
                                              'newvaldicts[%d][adm]' % i)
                formula = formula.replace('#pzbgvjh', 'population_lookup[adm]')
                newvaldict = dict()
                for adm in valdicts[0].keys():
                    try:
                        val = eval(formula)
                    except (ValueError, TypeError, KeyError):
                        val = ''
                    newvaldict[adm] = val
                retval.append(newvaldict)
        else:
            retval.extend(valdicts)

        sources.extend([(hxltag, date, datasetinfo['source'],
                         datasetinfo['source_url']) for hxltag in hxltags])
    logger.info('Processed %s' % name)
    return retheaders, retval, sources
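
For orientation, a sketch of the shape of datasetinfo that _get_tabular consumes, with keys inferred from the .get() calls above (the values themselves are purely illustrative):

    datasetinfo = {
        'source': 'Example source',
        'source_url': 'https://example.org/data.csv',
        'date': '2020-10-01',            # or set use_date_from_date_col and date_col instead
        'adm_cols': ['#country+code'],   # overwritten when use_hxl derives admin columns
        'indicator_cols': [{
            'filter_col': None,
            'val_cols': ['#affected+infected'],
            'columns': ['TotalCases'],
            'hxltags': ['#affected+infected'],
            # optional: 'eval_cols', 'total_cols', 'val_fns', 'keep_cols',
            # 'append_cols', 'ignore_vals'
        }],
    }
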
Code example #19
0
def get_education(configuration,
                  today,
                  countryiso3s,
                  regionlookup,
                  downloader,
                  scrapers=None):
    name = 'education'
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list(), list(), list(), list()
    educationinfo = configuration[name]
    datasetinfo = educationinfo['closures']
    closures_headers, closures_iterator = read(downloader, datasetinfo)
    closures = dict()
    country_dates = dict()
    for row in closures_iterator:
        countryiso = row['ISO']
        if not countryiso or countryiso not in countryiso3s:
            continue
        date = row['Date']
        if isinstance(date, str):
            date = parse_date(date)
        if date > today:
            continue
        max_date = country_dates.get(countryiso, default_date)
        if date < max_date:
            continue
        country_dates[countryiso] = date
        closures[countryiso] = row['Status']
    fully_closed = list()
    for countryiso, closure in closures.items():
        if closure.lower() == 'closed due to covid-19':
            fully_closed.append(countryiso)
    datasetinfo = educationinfo['enrolment']
    learners_headers, learners_iterator = read(downloader, datasetinfo)
    learners_012 = dict()
    learners_3 = dict()
    affected_learners = dict()
    all_learners = dict()

    for row in learners_iterator:
        countryiso = row['ISO3']
        if not countryiso or countryiso not in countryiso3s:
            continue
        l_0 = row['Pre-primary (both)']
        l_1 = row['Primary (both)']
        l_2 = row['Secondary (both)']
        l_3 = row['Tertiary (both)']
        l_012 = None
        if l_0 != '-':
            l_012 = int(l_0)
        if l_1 != '-':
            l_1 = int(l_1)
            if l_012 is None:
                l_012 = l_1
            else:
                l_012 += l_1
        if l_2 != '-':
            l_2 = int(l_2)
            if l_012 is None:
                l_012 = l_2
            else:
                l_012 += l_2
        if l_012 is not None:
            learners_012[countryiso] = l_012
        if l_3 == '-':
            l_3 = None
        else:
            l_3 = int(l_3)
            learners_3[countryiso] = l_3
        no_learners = None
        if l_012 is not None:
            no_learners = l_012
            if l_3:
                no_learners += l_3
        elif l_3 is not None:
            no_learners = l_3
        if no_learners is not None:
            all_learners[countryiso] = no_learners
            if countryiso in fully_closed:
                affected_learners[countryiso] = no_learners
    affected_learners_total = dict()
    learners_total = dict()
    closed_countries = dict()
    for countryiso in closures:
        country_learners = all_learners.get(countryiso)
        country_affected_learners = affected_learners.get(countryiso)
        for region in regionlookup.iso3_to_region_and_hrp[countryiso]:
            if country_learners is not None:
                learners_total[region] = learners_total.get(
                    region, 0) + country_learners
            if country_affected_learners is not None:
                affected_learners_total[region] = affected_learners_total.get(
                    region, 0) + country_affected_learners
                closed_countries[region] = closed_countries.get(region, 0) + 1
    percentage_affected_learners = dict()
    for region, no_learners in affected_learners_total.items():
        percentage_affected_learners[region] = get_fraction_str(
            no_learners, learners_total[region])
    logger.info('Processed education')
    grheaders = [
        'No. affected learners', 'Percentage affected learners',
        'No. closed countries'
    ]
    grhxltags = [
        '#affected+learners', '#affected+learners+pct',
        '#status+country+closed'
    ]
    headers = [
        'School Closure', 'No. pre-primary to upper-secondary learners',
        'No. tertiary learners', 'No. affected learners'
    ]
    hxltags = [
        '#impact+type', '#population+learners+pre_primary_to_secondary',
        '#population+learners+tertiary', '#affected+learners'
    ]
    return [grheaders, grhxltags], [affected_learners_total, percentage_affected_learners, closed_countries], \
           [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url']) for hxltag in grhxltags], \
           [headers, hxltags], [closures, learners_012, learners_3, affected_learners], \
           [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url']) for hxltag in hxltags]
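
The two datasetinfo blocks read here ('closures' and 'enrolment') need at least source, source_url and date for the returned source tuples, plus whatever read() requires to fetch the files. A hedged sketch (URLs and values are placeholders, not the project's real configuration):

    educationinfo = {
        'closures': {
            'source': 'Placeholder source',
            'source_url': 'https://example.org/school_closures.csv',
            'format': 'csv',     # assumed key for read()
            'date': '2020-10-01',
        },
        'enrolment': {
            'source': 'Placeholder source',
            'source_url': 'https://example.org/enrolment.csv',
            'format': 'csv',     # assumed key for read()
            'date': '2020-10-01',
        },
    }
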
Code example #20
0
def _run_scraper(countryiso3s, adminone, level, today, name, datasetinfo,
                 headers, iterator, population_lookup, results):
    # type: (List[str], AdminOne, str, datetime, str, Dict, List[str], Iterator[Union[List,Dict]], Dict[str,int], Dict) -> None
    """Run one mini scraper.

    Args:
        countryiso3s (List[str]): List of ISO3 country codes to process
        adminone (AdminOne): AdminOne object from HDX Python Country library that handles processing of admin level 1
        level (str): Can be global, national or subnational
        today (datetime): Value to use for today
        name (str): Name of mini scraper
        datasetinfo (Dict): Dictionary of information about dataset
        headers (List[str]): Row headers
        iterator (Iterator[Union[List,Dict]]): Rows
        population_lookup (Dict[str,int]): Dictionary from admin code to population
        results (Dict): Dictionary of output containing output headers, values and sources

    Returns:
        None
    """
    subsets = datasetinfo.get('subsets')
    if not subsets:
        subsets = [{
            'filter': datasetinfo.get('filter'),
            'input_cols': datasetinfo.get('input_cols', list()),
            'input_transforms': datasetinfo.get('input_transforms', dict()),
            'process_cols': datasetinfo.get('process_cols', list()),
            'input_keep': datasetinfo.get('input_keep', list()),
            'input_append': datasetinfo.get('input_append', list()),
            'sum_cols': datasetinfo.get('sum_cols'),
            'input_ignore_vals': datasetinfo.get('input_ignore_vals', list()),
            'output_cols': datasetinfo.get('output_cols', list()),
            'output_hxltags': datasetinfo.get('output_hxltags', list())
        }]
    use_hxl = datasetinfo.get('use_hxl', False)
    if use_hxl:
        hxlrow = next(iterator)
        while not hxlrow:
            hxlrow = next(iterator)
        exclude_tags = datasetinfo.get('exclude_tags', list())
        adm_cols = list()
        input_cols = list()
        columns = list()
        for header in headers:
            hxltag = hxlrow[header]
            if not hxltag or hxltag in exclude_tags:
                continue
            if '#country' in hxltag:
                if 'code' in hxltag:
                    if len(adm_cols) == 0:
                        adm_cols.append(hxltag)
                    else:
                        adm_cols[0] = hxltag
                continue
            if '#adm1' in hxltag:
                if 'code' in hxltag:
                    if len(adm_cols) == 0:
                        adm_cols.append(None)
                    if len(adm_cols) == 1:
                        adm_cols.append(hxltag)
                continue
            if hxltag == datasetinfo.get('date_col') and datasetinfo.get(
                    'include_date', False) is False:
                continue
            input_cols.append(hxltag)
            columns.append(header)
        datasetinfo['adm_cols'] = adm_cols
        for subset in subsets:
            orig_input_cols = subset.get('input_cols', list())
            if not orig_input_cols:
                orig_input_cols.extend(input_cols)
            subset['input_cols'] = orig_input_cols
            orig_columns = subset.get('output_cols', list())
            if not orig_columns:
                orig_columns.extend(columns)
            subset['output_cols'] = orig_columns
            orig_hxltags = subset.get('output_hxltags', list())
            if not orig_hxltags:
                orig_hxltags.extend(input_cols)
            subset['output_hxltags'] = orig_hxltags
    else:
        hxlrow = None

    rowparser = RowParser(countryiso3s, adminone, level, today, datasetinfo,
                          headers, subsets)
    iterator = rowparser.sort_rows(iterator, hxlrow)
    valuedicts = dict()
    for subset in subsets:
        for _ in subset['input_cols']:
            dict_of_lists_add(valuedicts, subset['filter'], dict())

    def add_row(row):
        adm, should_process_subset = rowparser.parse(row, name)
        if not adm:
            return
        for i, subset in enumerate(subsets):
            if not should_process_subset[i]:
                continue
            filter = subset['filter']
            input_ignore_vals = subset.get('input_ignore_vals', list())
            input_transforms = subset.get('input_transforms', dict())
            sum_cols = subset.get('sum_cols')
            process_cols = subset.get('process_cols')
            input_append = subset.get('input_append', list())
            input_keep = subset.get('input_keep', list())
            for i, valcol in enumerate(subset['input_cols']):
                valuedict = valuedicts[filter][i]
                val = get_rowval(row, valcol)
                input_transform = input_transforms.get(valcol)
                if input_transform and val not in input_ignore_vals:
                    val = eval(input_transform.replace(valcol, 'val'))
                if sum_cols or process_cols:
                    dict_of_lists_add(valuedict, adm, val)
                else:
                    curval = valuedict.get(adm)
                    if valcol in input_append:
                        if curval:
                            val = curval + val
                    elif valcol in input_keep:
                        if curval:
                            val = curval
                    valuedict[adm] = val

    stop_row = datasetinfo.get('stop_row')
    for row in iterator:
        if not isinstance(row, dict):
            row = row.value
        if hxlrow:
            newrow = dict()
            for header in row:
                newrow[hxlrow[header]] = row[header]
            row = newrow
        if stop_row:
            if all(row[key] == value for key, value in stop_row.items()):
                break
        for newrow in rowparser.flatten(row):
            add_row(newrow)

    date = datasetinfo.get('date')
    use_date_from_date_col = datasetinfo.get('use_date_from_date_col', False)
    if date and not use_date_from_date_col:
        date = parse_date(date)
    else:
        date = rowparser.get_maxdate()
        if date == 0:
            raise ValueError('No date given in datasetinfo or as a column!')
        if rowparser.datetype == 'date':
            if not isinstance(date, datetime):
                date = parse_date(date)
        elif rowparser.datetype == 'int':
            date = get_datetime_from_timestamp(date)
        else:
            raise ValueError('No date type specified!')
    date = date.strftime('%Y-%m-%d')

    retheaders = results['headers']
    retvalues = results['values']
    sources = results['sources']
    for subset in subsets:
        output_cols = subset['output_cols']
        retheaders[0].extend(output_cols)
        output_hxltags = subset['output_hxltags']
        retheaders[1].extend(output_hxltags)
        valdicts = valuedicts[subset['filter']]
        process_cols = subset.get('process_cols')
        input_keep = subset.get('input_keep', list())
        sum_cols = subset.get('sum_cols')
        input_ignore_vals = subset.get('input_ignore_vals', list())
        valcols = subset['input_cols']
        # Indices of list sorted by length
        sorted_len_indices = sorted(range(len(valcols)),
                                    key=lambda k: len(valcols[k]),
                                    reverse=True)
        if process_cols:
            newvaldicts = [dict() for _ in process_cols]

            def text_replacement(string, adm):
                string = string.replace('#population', '#pzbgvjh')
                hasvalues = False
                for j in sorted_len_indices:
                    valcol = valcols[j]
                    if valcol not in string:
                        continue
                    if valcol in input_keep:
                        input_keep_index = 0
                    else:
                        input_keep_index = -1
                    val = valdicts[j][adm][input_keep_index]
                    if val is None or val == '' or val in input_ignore_vals:
                        val = 0
                    else:
                        hasvalues = True
                    string = string.replace(valcol, str(val))
                string = string.replace('#pzbgvjh', '#population')
                return string, hasvalues

            for i, process_col in enumerate(process_cols):
                valdict0 = valdicts[0]
                for adm in valdict0:
                    hasvalues = True
                    matches = regex.search(brackets,
                                           process_col,
                                           flags=regex.VERBOSE)
                    if matches:
                        for bracketed_str in matches.captures('rec'):
                            if any(bracketed_str in x for x in valcols):
                                continue
                            _, hasvalues_t = text_replacement(
                                bracketed_str, adm)
                            if not hasvalues_t:
                                hasvalues = False
                                break
                    if hasvalues:
                        formula, hasvalues_t = text_replacement(
                            process_col, adm)
                        if hasvalues_t:
                            formula = formula.replace(
                                '#population', 'population_lookup[adm]')
                            newvaldicts[i][adm] = eval(formula)
                        else:
                            newvaldicts[i][adm] = ''
                    else:
                        newvaldicts[i][adm] = ''
            retvalues.extend(newvaldicts)
        elif sum_cols:
            for sum_col in sum_cols:
                formula = sum_col['formula']
                mustbepopulated = sum_col.get('mustbepopulated', False)
                newvaldicts = [dict() for _ in valdicts]
                valdict0 = valdicts[0]
                for adm in valdict0:
                    for i, val in enumerate(valdict0[adm]):
                        if not val or val in input_ignore_vals:
                            exists = False
                        else:
                            exists = True
                            for valdict in valdicts[1:]:
                                val = valdict[adm][i]
                                if val is None or val == '' or val in input_ignore_vals:
                                    exists = False
                                    break
                        if mustbepopulated and not exists:
                            continue
                        for j, valdict in enumerate(valdicts):
                            val = valdict[adm][i]
                            if val is None or val == '' or val in input_ignore_vals:
                                continue
                            newvaldicts[j][adm] = eval(
                                f'newvaldicts[j].get(adm, 0.0) + {str(valdict[adm][i])}'
                            )
                formula = formula.replace('#population', '#pzbgvjh')
                for i in sorted_len_indices:
                    formula = formula.replace(valcols[i],
                                              'newvaldicts[%d][adm]' % i)
                formula = formula.replace('#pzbgvjh', 'population_lookup[adm]')
                newvaldict = dict()
                for adm in valdicts[0].keys():
                    try:
                        val = eval(formula)
                    except (ValueError, TypeError, KeyError):
                        val = ''
                    newvaldict[adm] = val
                retvalues.append(newvaldict)
        else:
            retvalues.extend(valdicts)
        source = datasetinfo['source']
        if isinstance(source, str):
            source = {'default_source': source}
        source_url = datasetinfo['source_url']
        if isinstance(source_url, str):
            source_url = {'default_url': source_url}
        sources.extend([(hxltag, date,
                         source.get(hxltag, source['default_source']),
                         source_url.get(hxltag, source_url['default_url']))
                        for hxltag in output_hxltags])
    logger.info('Processed %s' % name)
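
This function parallels _get_tabular above but with renamed configuration keys (indicator_cols becomes subsets, filter_col becomes filter, val_cols becomes input_cols, val_fns becomes input_transforms, eval_cols becomes process_cols, total_cols becomes sum_cols, and columns/hxltags become output_cols/output_hxltags), and source/source_url may now be dicts keyed by hxltag. A sketch of a subset definition in this vocabulary (values are illustrative only):

    datasetinfo = {
        'source': 'Example source',          # or {'default_source': ..., '#some+hxltag': ...}
        'source_url': 'https://example.org/data.csv',
        'date': '2020-10-01',
        'adm_cols': ['#country+code'],
        'subsets': [{
            'filter': None,
            'input_cols': ['#affected+infected'],
            'sum_cols': [{'formula': '#affected+infected', 'mustbepopulated': False}],
            'output_cols': ['TotalCases'],
            'output_hxltags': ['#affected+infected'],
        }],
    }
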
Code example #21
0
File: run.py Project: mcarans/crisis-casestudy
def main():
    configuration = Configuration.read()
    enddays = configuration['enddays']
    ignore_users = configuration['ignore_users']
    users_scrapers = configuration['users_scrapers']
    spreadsheet_url = configuration['spreadsheet_url']
    sheetname = configuration['sheetname']
    logger.info('> GSheet Credentials: %s' % gsheet_auth)
    users = dict()
    info = json.loads(gsheet_auth)
    scopes = ['https://www.googleapis.com/auth/spreadsheets']
    credentials = service_account.Credentials.from_service_account_info(info, scopes=scopes)
    gc = pygsheets.authorize(custom_credentials=credentials)
    spreadsheet = gc.open_by_url(spreadsheet_url)
    sheet = spreadsheet.worksheet_by_title(sheetname)
    keys = sheet.get_row(1)
    rows = [keys]
    crisisdata = configuration['crisisdata']
    for crisis in crisisdata:
        data = crisisdata[crisis]
        startdate = parse_date(data['startdate'])
        enddate = startdate + timedelta(days=enddays)
        searchlist = list()
        for country in data.get('countries', list()):
            iso3, _ = Country.get_iso3_country_code_fuzzy(country)
            searchlist.append('groups:%s' % iso3.lower())
        for tag in data.get('tags', list()):
            searchlist.append('vocab_Topics:"%s"' % tag.lower())
        search_string = 'metadata_created:[2000-01-01T00:00:00.000Z TO %sZ] AND (%s)' % (enddate.isoformat(), ' OR '.join(searchlist))
        datasets = Dataset.search_in_hdx(fq=search_string)
        row = {'ID': data['id'], 'Crisis name': crisis}
        count = 0
        largest_activities = 0
        for dataset in datasets:
            metadata_created_str = dataset['metadata_created']
            orgname = dataset['organization']['name']
            metadata_created = parse_date(metadata_created_str)
            new_or_updated = 'new'
            updated_when = ''
            updated_by = ''
            # if metadata_created < startdate:
            #     activities = Activity.get_all_activities(id=dataset['id'], limit=10000)
            #     activities_len = len(activities)
            #     if activities_len > largest_activities:
            #         largest_activities = activities_len
            #     found = False
            #     for activity in activities:
            #         timestamp = activity['timestamp']
            #         activity_date = parse_date(timestamp)
            #         if startdate < activity_date < enddate:
            #             new_or_updated = 'updated'
            #             updated_when = timestamp
            #             user_id = activity['user_id']
            #             check_ignore = True
            #             for user_scrapers in users_scrapers:
            #                 if user_id == user_scrapers['id']:
            #                     if orgname in user_scrapers['scrapers']:
            #                         check_ignore = False
            #                         break
            #             if check_ignore:
            #                 if user_id in ignore_users:
            #                     continue
            #             username = users.get(user_id)
            #             if username is None:
            #                 user = User.read_from_hdx(user_id)
            #                 username = get_user_name(user)
            #                 users[user_id] = username
            #             updated_by = username
            #             found = True
            #             break
            #     if not found:
            #         continue
            row['dataset title'] = dataset['title']
            row['dataset id'] = dataset['id']
            row['dataset url'] = dataset.get_hdx_url()
            row['org name'] = orgname
            row['org id'] = dataset['organization']['id']
            row['created'] = metadata_created_str
            row['new or updated'] = new_or_updated
            row['updated when'] = updated_when
            row['updated by'] = updated_by
            rows.append([row.get(key, '') for key in keys])
            count += 1
        logger.info('%s: %d\t%s' % (crisis, count, search_string))
    sheet.clear()
    sheet.update_values('A1', rows)
    logger.info('Longest activities: %d' % largest_activities)
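
Each crisisdata entry iterated above needs at least an id and a startdate, with optional countries and tags lists. A hedged sketch of one entry (the name, dates and tag are made up for illustration):

    crisisdata = {
        'Example Cyclone': {
            'id': 'example-cyclone-2019',
            'startdate': '2019-03-14',
            'countries': ['Mozambique', 'Zimbabwe'],
            'tags': ['cyclones-hurricanes-typhoons'],
        }
    }
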
Code example #22
0
    def do_set_value(self, row, scrapername=None):
        if self.filtered(row):
            return None, None

        adms = [None for _ in range(len(self.admcols))]

        def get_adm(admcol, i):
            match = template.search(admcol)
            if match:
                template_string = match.group()
                admcol = self.headers[int(template_string[2:-2])]
            adm = row[admcol]
            if not adm:
                return False
            adms[i] = row[admcol].strip()
            return self.admininfo.get_adm(adms, self.admexact, i, scrapername)

        for i, admcol in enumerate(self.admcols):
            if admcol is None:
                continue
            if isinstance(admcol, str):
                admcol = [admcol]
            for admcl in admcol:
                exact = get_adm(admcl, i)
                if adms[i] and exact:
                    break
            if not adms[i]:
                return None, None

        indicators_process = list()
        for indicatorcol in self.indicatorcols:
            filtercol = indicatorcol['filter_col']
            process = True
            if filtercol:
                filtercols = filtercol.split('|')
                match = True
                for filterstr in filtercols:
                    filter = filterstr.split('=')
                    if row[filter[0]] != filter[1]:
                        match = False
                        break
                process = match
            indicators_process.append(process)

        if self.datecol:
            if isinstance(self.datecol, list):
                dates = [str(row[x]) for x in self.datecol]
                date = ''.join(dates)
            else:
                date = row[self.datecol]
            if self.datetype == 'date':
                if not isinstance(date, datetime):
                    date = parse_date(date)
                date = date.replace(tzinfo=None)
            else:
                date = int(date)
            if self.date_condition:
                if eval(self.date_condition) is False:
                    return None, None
            for i, process in enumerate(indicators_process):
                if not process:
                    continue
                if date > self.maxdate:
                    self.maxdate = date
                if self.level is None:
                    if self.maxdateonly:
                        if date < self.maxdates[i]:
                            indicators_process[i] = False
                        else:
                            self.maxdates[i] = date
                    else:
                        self.maxdates[i] = date
                else:
                    if self.maxdateonly:
                        if date < self.maxdates[i][adms[self.level]]:
                            indicators_process[i] = False
                        else:
                            self.maxdates[i][adms[self.level]] = date
                    else:
                        self.maxdates[i][adms[self.level]] = date
        if self.level is None:
            return 'global', indicators_process
        return adms[self.level], indicators_process
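
A minimal sketch of how a caller might drive do_set_value, assuming a RowParser already constructed with the appropriate level, datasetinfo and headers as in the earlier examples on this page (the scraper name is illustrative):

    for row in iterator:
        adm, indicators_process = rowparser.do_set_value(row, 'example_scraper')
        if not adm:
            continue  # row was filtered out or its admin unit could not be resolved
        for i, process in enumerate(indicators_process):
            if process:
                pass  # this row should contribute to indicator column i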