def get_soup(
    url: str,
    downloader: Optional[Download] = None,  # was `Download = None`: default None requires Optional
    user_agent: Optional[str] = None,
    user_agent_config_yaml: Optional[str] = None,
    user_agent_lookup: Optional[str] = None,
    **kwargs: Any,
) -> BeautifulSoup:
    """
    Get BeautifulSoup object for a url. Requires either global user agent to be set or
    appropriate user agent parameter(s) to be completed.

    Args:
        url (str): url to read
        downloader (Optional[Download]): Download object. Defaults to creating a Download
        object with given user agent values.
        user_agent (Optional[str]): User agent string. HDXPythonUtilities/X.X.X- is prefixed.
        user_agent_config_yaml (Optional[str]): Path to YAML user agent configuration.
        Ignored if user_agent supplied. Defaults to ~/.useragent.yml.
        user_agent_lookup (Optional[str]): Lookup key for YAML. Ignored if user_agent supplied.

    Returns:
        BeautifulSoup: The BeautifulSoup object for a url
    """
    if not downloader:
        # No downloader supplied: build one from the user agent parameters.
        downloader = Download(
            user_agent, user_agent_config_yaml, user_agent_lookup, **kwargs
        )
    response = downloader.download(url)
    return BeautifulSoup(response.text, "html.parser")
def add_food_prices(configuration, today, countryiso3s, retriever, basic_auths, scrapers=None):
    """Compute a weighted food-price-crisis ratio per country from WFP VAM data.

    For each country, downloads the commodity list and the last six months of
    market price ALPS indicators, then for the latest available year/month
    computes per market the weighted share of commodities whose PEWI value is
    at crisis level (>= 1.0), and averages that share across markets.

    Args:
        configuration: Configuration mapping containing a 'food_prices' entry.
        today: Reference date (datetime/date) used for HDX metadata and the
            six-month lookback window.
        countryiso3s: Iterable of ISO3 country codes to process.
        retriever: Retrieve object used to download/read saved JSON.
        basic_auths: Mapping of scraper name to basic auth credentials.
        scrapers: Optional list of scraper name fragments to run; if given and
            none matches 'food_prices', nothing is done.

    Returns:
        Tuple of (headers, values, sources) in the framework's standard shape,
        or three empty lists if this scraper is filtered out.
    """
    name = 'food_prices'
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list()
    datasetinfo = configuration[name]
    read_hdx_metadata(datasetinfo, today=today)
    base_url = datasetinfo['base_url']
    if retriever.use_saved:
        # Reading previously saved files: no authentication needed.
        headers = None
    else:
        # Obtain an OAuth bearer token using the scraper's basic auth credentials.
        basic_auth = basic_auths[name]
        token_downloader = Download(basic_auth=basic_auth)
        token_downloader.download(
            f'{base_url}/token', post=True,
            parameters={'grant_type': 'client_credentials'})
        access_token = token_downloader.get_json()['access_token']
        headers = {
            'Accept': 'application/json',
            'Authorization': f'Bearer {access_token}'
        }

    def get_list(endpoint, countryiso3, startdate=None):
        # Page through the endpoint, accumulating 'items', until an empty page.
        url = f'{base_url}/{endpoint}'
        # e.g. '.../Commodities/List' -> 'Commodities'; used to name saved files.
        filename = url.split('/')[-2]
        page = 1
        all_data = []
        data = None
        while data is None or len(data) > 0:
            parameters = {'CountryCode': countryiso3, 'page': page}
            if startdate:
                parameters['startDate'] = startdate
            try:
                # FIX: the saved-file name and log text were a garbled literal
                # '(unknown)'; interpolate the computed (previously unused) filename.
                json = retriever.retrieve_json(
                    url, f'{filename}_{countryiso3}_{page}.json',
                    f'{filename} for {countryiso3} page {page}', False,
                    parameters=parameters, headers=headers)
            except FileNotFoundError:
                json = {'items': list()}
            data = json['items']
            all_data.extend(data)
            page = page + 1
        return all_data

    six_months_ago = today - relativedelta(months=6)
    ratios = dict()
    # Weight per commodity category (ids 1-7); categories >= 8 are skipped below.
    category_id_weights = {1: 2, 2: 4, 3: 4, 4: 1, 5: 3, 6: 0.5, 7: 0.5}
    for countryiso3 in countryiso3s:
        logger.info(f'Processing {countryiso3}')
        commodities = get_list('vam-data-bridges/1.1.0/Commodities/List', countryiso3)
        if not commodities:
            logger.info(f'{countryiso3} has no commodities!')
            continue
        commodity_id_to_category_id = {
            x['id']: x['categoryId'] for x in commodities
        }
        alps = get_list('vam-data-bridges/1.1.0/MarketPrices/Alps', countryiso3, six_months_ago)
        if not alps:
            logger.info(f'{countryiso3} has no ALPS!')
            continue
        # Group actual (non-forecast) rows by 'YYYY/M' year-month.
        yearmonth_rows = dict()
        for row in alps:
            analysis_value_price_flag = row['analysisValuePriceFlag']
            if analysis_value_price_flag == 'forecast':
                continue
            commodity_id = row['commodityID']
            category_id = commodity_id_to_category_id.get(commodity_id)
            if not category_id or category_id >= 8:
                continue
            row['categoryId'] = category_id
            yearmonth = f'{row["commodityPriceDateYear"]}/{row["commodityPriceDateMonth"]}'
            dict_of_lists_add(yearmonth_rows, yearmonth, row)
        yearmonths = yearmonth_rows.keys()
        if len(yearmonths) == 0:
            logger.info(f'{countryiso3} has no values!')
            continue
        # FIX: months are not zero-padded, so a plain lexicographic max() picks
        # e.g. '2021/9' over '2021/12'. Compare (year, month) numerically instead.
        latest_yearmonth = max(
            yearmonths, key=lambda ym: tuple(int(part) for part in ym.split('/')))
        commodities_per_market = dict()
        commodities_per_market_crisis = dict()
        for row in yearmonth_rows[latest_yearmonth]:
            market_id = row['marketID']
            category_id = row['categoryId']
            weighted_value = category_id_weights[category_id]
            commodities_per_market[market_id] = commodities_per_market.get(
                market_id, 0) + weighted_value
            pewivalue = row['analysisValuePewiValue']
            # PEWI >= 1.0 means the commodity price is at crisis level.
            if pewivalue >= 1.0:
                commodities_per_market_crisis[
                    market_id] = commodities_per_market_crisis.get(
                        market_id, 0) + weighted_value
        # Average, over markets, the weighted fraction of commodities in crisis.
        country_ratio = 0
        for market_id in commodities_per_market:
            market_ratio = commodities_per_market_crisis.get(
                market_id, 0) / commodities_per_market[market_id]
            country_ratio += market_ratio
        country_ratio /= len(commodities_per_market)
        ratios[countryiso3] = number_format(country_ratio, trailing_zeros=False)
    hxltag = '#value+food+num+ratio'
    logger.info('Processed WFP')
    return [['Food Prices Ratio'], [hxltag]
            ], [ratios], [(hxltag, datasetinfo['date'], datasetinfo['source'],
                           datasetinfo['source_url'])]