def test_load_empty(self, fixturesfolder):
    """Every loader must raise LoadError when given an empty file."""
    folder = join(fixturesfolder, "loader")
    cases = (
        (load_file_to_str, "empty.yml"),
        (load_yaml, "empty.yml"),
        (load_json, "empty.json"),
    )
    for loadfn, filename in cases:
        with pytest.raises(LoadError):
            loadfn(join(folder, filename))
def test_load_file_to_str(self):
    """load_file_to_str honours strip/replace_newlines and raises IOError on a missing file."""
    with temp_dir(folder="test_text") as tmpdir:
        text_file = join(tmpdir, "text_file.txt")
        save_str_to_file(TestLoader.text, text_file)
        expectations = (
            ({}, TestLoader.text),
            ({"strip": True}, TestLoader.expected_text_strip),
            ({"replace_newlines": " "}, TestLoader.expected_text_newlines_to_spaces),
        )
        for kwargs, expected in expectations:
            assert load_file_to_str(text_file, **kwargs) == expected
        with pytest.raises(IOError):
            load_file_to_str(join(tmpdir, "NOTEXIST.txt"))
def retrieve_text(
    self, url, filename, logstr=None, fallback=False, **kwargs
):
    """Retrieve text, either from a previously saved file or by downloading,
    optionally saving the downloaded text or falling back to static data if
    the download fails.

    Args:
        url (str): URL to download
        filename (str): Filename to use for saved file
        logstr (Optional[str]): Text to use in log string to describe download. Defaults to filename.
        fallback (bool): Whether to use static fallback if download fails. Defaults to False.
        **kwargs: Parameters to pass to download call

    Returns:
        str: The text from the file
    """
    if not logstr:
        logstr = filename
    saved_path = join(self.saved_dir, filename)
    if self.use_saved:
        # Offline/replay mode: read the previously saved copy instead of downloading
        logger.info(f"Using saved {logstr} in {saved_path}")
        text = load_file_to_str(saved_path)
    else:
        try:
            logger.info(
                f"Downloading {logstr} from {self.get_url_logstr(url)}"
            )
            self.downloader.download(url, **kwargs)
            text = self.downloader.get_text()
            if self.save:
                # Record mode: persist the download for later use_saved runs
                logger.info(f"Saving {logstr} in {saved_path}")
                save_str_to_file(text, saved_path)
        except DownloadError:
            if not fallback:
                raise
            # Best-effort: use bundled static data when the download fails
            fallback_path = join(self.fallback_dir, filename)
            logger.exception(
                f"{logstr} download failed, using static data {fallback_path}!"
            )
            text = load_file_to_str(fallback_path)
    return text
def load_api_key(path):
    # type: (str) -> str
    """Load HDX api key

    Args:
        path (str): Path to HDX key

    Returns:
        str: HDX api key
    """
    logger.info('Loading HDX api key from: %s' % path)
    return load_file_to_str(path)
def read_or_create_batch(folder: str, batch: Optional[str] = None) -> str:
    """Get batch or create it if it doesn't exist

    Args:
        folder (str): Folder in which to look for or create batch file.
        batch (Optional[str]): Batch to use if there isn't one in a file already.

    Returns:
        str: Batch
    """
    batch_file = join(folder, "batch.txt")
    # A batch file on disk always takes precedence over the supplied batch
    if exists(batch_file):
        batch = load_file_to_str(batch_file, strip=True)
        logger.info(f"File BATCH = {batch}")
        return batch
    if not batch:
        batch = get_uuid()
        logger.info(f"Generated BATCH = {batch}")
    save_str_to_file(batch, batch_file)
    return batch
def __init__(self, auth=None, basicauth=None, basicauthfile=None):
    # type: (Optional[Tuple[str, str]], Optional[str], Optional[str]) -> None
    """Set up a requests Session with retrying and optional authorisation.

    Exactly one of the authorisation sources may be supplied.

    Args:
        auth (Optional[Tuple[str, str]]): Authorisation in (user, password) tuple form
        basicauth (Optional[str]): Authorisation in basic auth string form
        basicauthfile (Optional[str]): Path to file containing basic auth string

    Raises:
        DownloadError: If more than one authorisation source is supplied
    """
    s = requests.Session()
    if basicauthfile is not None:
        if basicauth is not None:
            raise DownloadError('Both basicauth and basicauthfile supplied!')
        elif auth is not None:
            raise DownloadError('Both auth and basicauthfile supplied!')
        else:
            basicauth = load_file_to_str(basicauthfile)
    if basicauth is not None:
        if auth is None:
            auth = decode(basicauth)
        else:
            raise DownloadError('Both auth and basicauth supplied!')
    s.auth = auth
    # Retry transient failures (rate limiting and 5xx) with exponential backoff
    retries = Retry(total=5, backoff_factor=0.4,
                    status_forcelist=[429, 500, 502, 503, 504],
                    raise_on_redirect=True, raise_on_status=True)
    s.mount('http://', HTTPAdapter(max_retries=retries, pool_connections=100,
                                   pool_maxsize=100))
    s.mount('https://', HTTPAdapter(max_retries=retries, pool_connections=100,
                                    pool_maxsize=100))
    self.session = s
    self.response = None
# Optional extra dependencies installable via pip extras (e.g. pip install pkg[pandas])
extras_requirements = {'pandas': ['pandas>=1.2.4']}
# Trove classifiers describing the package on PyPI
classifiers = [
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "Natural Language :: English",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
    "Programming Language :: Python",
    "Programming Language :: Python :: 3",
    "Topic :: Software Development :: Libraries :: Python Modules",
]
# Single-source the version from the packaged version.txt file
PublishCommand.version = load_file_to_str(join('src', 'hdx', 'scraper', 'version.txt'), strip=True)
setup(
    name='hdx-python-scraper',
    description='HDX Python Scraper Library',
    license='MIT',
    url='https://github.com/OCHA-DAP/hdx-python-scraper',
    version=PublishCommand.version,
    author='Michael Rans',
    author_email='*****@*****.**',
    keywords=['HDX', 'API', 'library'],
    # README.md is rendered as the PyPI long description
    long_description=load_file_to_str('README.md'),
    long_description_content_type='text/markdown',
    # src-layout: packages live under src/
    packages=find_packages(where='src'),
    package_dir={'': 'src'},
    # NOTE(review): the setup() call continues beyond this chunk
def htmltext(self, fixturesfolder):
    """Fixture returning the contents of the html response fixture file."""
    response_path = join(fixturesfolder, "html", "response.html")
    return load_file_to_str(response_path)
def get_session(
    user_agent: Optional[str] = None,
    user_agent_config_yaml: Optional[str] = None,
    user_agent_lookup: Optional[str] = None,
    use_env: bool = True,
    fail_on_missing_file: bool = True,
    **kwargs: Any,
) -> requests.Session:
    """Set up and return Session object that is set up with retrying. Requires
    either global user agent to be set or appropriate user agent parameter(s)
    to be completed. If the EXTRA_PARAMS or BASIC_AUTH environment variable is
    supplied, the extra_params* parameters will be ignored.

    Args:
        user_agent (Optional[str]): User agent string. HDXPythonUtilities/X.X.X- is prefixed.
        user_agent_config_yaml (Optional[str]): Path to YAML user agent configuration. Ignored if user_agent supplied. Defaults to ~/.useragent.yml.
        user_agent_lookup (Optional[str]): Lookup key for YAML. Ignored if user_agent supplied.
        use_env (bool): Whether to read environment variables. Defaults to True.
        fail_on_missing_file (bool): Raise an exception if any specified configuration files are missing. Defaults to True.
        **kwargs: See below
        auth (Tuple[str, str]): Authorisation information in tuple form (user, pass) OR
        basic_auth (str): Authorisation information in basic auth string form (Basic xxxxxxxxxxxxxxxx) OR
        basic_auth_file (str): Path to file containing authorisation information in basic auth string form (Basic xxxxxxxxxxxxxxxx)
        extra_params_dict (Dict): Extra parameters to put on end of url as a dictionary OR
        extra_params_json (str): Path to JSON file containing extra parameters to put on end of url OR
        extra_params_yaml (str): Path to YAML file containing extra parameters to put on end of url
        extra_params_lookup (str): Lookup key for parameters. If not given assumes parameters are at root of the dict.
        headers (Dict): Additional headers to add to request.
        status_forcelist (iterable): HTTP statuses for which to force retry. Defaults to [429, 500, 502, 503, 504].
        allowed_methods (iterable): HTTP methods for which to force retry. Defaults to frozenset(["HEAD", "TRACE", "GET", "PUT", "OPTIONS", "DELETE"]).

    Returns:
        requests.Session: Session object set up with retrying

    Raises:
        SessionError: If more than one set of extra parameters or more than
            one authorisation is given, or the lookup key is missing.
    """
    s = requests.Session()

    ua = kwargs.get("full_agent")
    if not ua:
        ua = UserAgent.get(user_agent, user_agent_config_yaml, user_agent_lookup, **kwargs)
    s.headers["User-Agent"] = ua

    auths_found = list()
    headers = kwargs.get("headers")
    if headers is not None:
        s.headers.update(headers)
        if "Authorization" in headers:
            auths_found.append("headers")

    extra_params_found = False
    extra_params_dict = None
    basic_auth = None
    if use_env:
        basic_auth_env = os.getenv("BASIC_AUTH")
        if basic_auth_env:
            basic_auth = basic_auth_env
            auths_found.append("basic_auth environment variable")
        extra_params = os.getenv("EXTRA_PARAMS")
        if extra_params:
            if "=" in extra_params:
                extra_params_dict = dict()
                logger.info(
                    "Loading extra parameters from environment variable")
                for extra_param in extra_params.split(","):
                    # Split on the first "=" only so values may themselves contain "="
                    key, value = extra_param.split("=", 1)
                    extra_params_dict[key] = value
            extra_params_found = True
    if not extra_params_found:
        # only do this if extra params env vars not supplied
        extra_params_dict = kwargs.get("extra_params_dict")
        if extra_params_dict:
            extra_params_found = True
            logger.info("Loading extra parameters from dictionary")
        extra_params_json = kwargs.get("extra_params_json", "")
        if extra_params_json:
            if extra_params_found:
                raise SessionError(
                    "More than one set of extra parameters given!")
            extra_params_found = True
            logger.info(f"Loading extra parameters from: {extra_params_json}")
            try:
                extra_params_dict = load_json(extra_params_json)
            except OSError:
                if fail_on_missing_file:
                    raise
        extra_params_yaml = kwargs.get("extra_params_yaml", "")
        if extra_params_yaml:
            if extra_params_found:
                raise SessionError(
                    "More than one set of extra parameters given!")
            # Mark as found for consistency with the json branch above
            extra_params_found = True
            logger.info(f"Loading extra parameters from: {extra_params_yaml}")
            try:
                extra_params_dict = load_yaml(extra_params_yaml)
            except OSError:
                if fail_on_missing_file:
                    raise
    extra_params_lookup = kwargs.get("extra_params_lookup")
    if extra_params_lookup and extra_params_dict:
        extra_params_dict = extra_params_dict.get(extra_params_lookup)
        if extra_params_dict is None:
            raise SessionError(
                f"{extra_params_lookup} does not exist in extra_params!")
    if extra_params_dict:
        # basic_auth embedded in extra params is authorisation, not a URL parameter
        basic_auth_param = extra_params_dict.get("basic_auth")
        if basic_auth_param:
            basic_auth = basic_auth_param
            auths_found.append("basic_auth parameter")
            del extra_params_dict["basic_auth"]
    s.params = extra_params_dict
    basic_auth_arg = kwargs.get("basic_auth")
    if basic_auth_arg:
        basic_auth = basic_auth_arg
        auths_found.append("basic_auth argument")
    auth = kwargs.get("auth")
    if auth:
        auths_found.append("auth argument")
    basic_auth_file = kwargs.get("basic_auth_file")
    if basic_auth_file:
        logger.info(f"Loading basic auth from: {basic_auth_file}")
        try:
            basic_auth = load_file_to_str(basic_auth_file, strip=True)
            auths_found.append(f"file {basic_auth_file}")
        except OSError:
            if fail_on_missing_file:
                raise
    if len(auths_found) > 1:
        auths_found_str = ", ".join(auths_found)
        raise SessionError(
            f"More than one authorisation given! ({auths_found_str})")
    if "headers" not in auths_found:
        if basic_auth:
            auth = decode(basic_auth)
        s.auth = auth
    status_forcelist = kwargs.get("status_forcelist", [429, 500, 502, 503, 504])
    allowed_methods = kwargs.get(
        "allowed_methods",
        frozenset(["HEAD", "TRACE", "GET", "PUT", "OPTIONS", "DELETE"]),
    )
    retries = Retry(
        total=5,
        backoff_factor=0.4,
        status_forcelist=status_forcelist,
        allowed_methods=allowed_methods,
        raise_on_redirect=True,
        raise_on_status=True,
    )
    s.mount("file://", FileAdapter())
    s.mount(
        "http://",
        HTTPAdapter(max_retries=retries, pool_connections=100, pool_maxsize=100),
    )
    s.mount(
        "https://",
        HTTPAdapter(max_retries=retries, pool_connections=100, pool_maxsize=100),
    )
    return s
def test_progress_storing_tempdir(self, monkeypatch):
    """Exercise progress_storing_tempdir: complete runs, the WHERETOSTART
    environment variable, resuming after a raised error, RESET, and
    non-matching start keys."""
    tempfolder = "papa"
    expected_dir = join(gettempdir(), tempfolder)
    # Ensure a clean slate in case a previous run left the folder behind
    rmtree(expected_dir, ignore_errors=True)
    iterator = [
        {
            "iso3": "AFG",
            "name": "Afghanistan"
        },
        {
            "iso3": "SDN",
            "name": "Sudan"
        },
        {
            "iso3": "YEM",
            "name": "Yemen"
        },
        {
            "iso3": "ZAM",
            "name": "Zambia"
        },
    ]
    expected_batch_file = join(expected_dir, "batch.txt")
    # 1. Complete run: every item is yielded, the batch file matches the
    # yielded batch, and the folder is removed on success
    result = list()
    for info, nextdict in progress_storing_tempdir(tempfolder, iterator, "iso3"):
        assert info["folder"] == expected_dir
        expected_batch = load_file_to_str(expected_batch_file, strip=True)
        result.append(nextdict)
    assert result == iterator
    assert expected_batch == info["batch"]
    assert exists(expected_dir) is False
    # 2. WHERETOSTART=iso3=SDN skips items before SDN
    monkeypatch.setenv("WHERETOSTART", "iso3=SDN")
    result = list()
    for info, nextdict in progress_storing_tempdir(tempfolder, iterator, "iso3"):
        assert exists(info["folder"]) is True
        assert info["folder"] == expected_dir
        expected_batch = load_file_to_str(expected_batch_file, strip=True)
        result.append(nextdict)
    assert result == iterator[1:]
    assert expected_batch == info["batch"]
    assert exists(expected_dir) is False
    monkeypatch.delenv("WHERETOSTART")
    # 3. Simulate a failure at YEM: the folder must persist so a later run can resume
    try:
        for info, nextdict in progress_storing_tempdir(
                tempfolder, iterator, "iso3"):
            if nextdict["iso3"] == "YEM":
                start_batch = info["batch"]
                raise ValueError("Problem!")
    except ValueError:
        pass
    assert exists(expected_dir) is True
    # 4. Resume: continues from YEM keeping the same batch code
    result = list()
    for info, nextdict in progress_storing_tempdir(tempfolder, iterator, "iso3"):
        assert exists(info["folder"]) is True
        assert info["folder"] == expected_dir
        assert info["batch"] == start_batch
        result.append(nextdict)
    assert result == iterator[2:]
    assert exists(expected_dir) is False
    # 5. Fail again, then WHERETOSTART=RESET restarts from the beginning with a new batch
    try:
        for info, nextdict in progress_storing_tempdir(
                tempfolder, iterator, "iso3"):
            if nextdict["iso3"] == "YEM":
                start_batch = info["batch"]
                raise ValueError("Problem!")
    except ValueError:
        pass
    assert exists(expected_dir) is True
    monkeypatch.setenv("WHERETOSTART", "RESET")
    result = list()
    for info, nextdict in progress_storing_tempdir(tempfolder, iterator, "iso3"):
        assert exists(info["folder"]) is True
        assert info["folder"] == expected_dir
        assert info["batch"] != start_batch
        result.append(nextdict)
    assert result == iterator
    assert exists(expected_dir) is False
    monkeypatch.delenv("WHERETOSTART")
    # 6. Fail at YEM, then WHERETOSTART=iso3=SDN overrides the stored progress
    # while the batch from the failed run is kept
    try:
        for info, nextdict in progress_storing_tempdir(
                tempfolder, iterator, "iso3"):
            if nextdict["iso3"] == "YEM":
                start_batch = info["batch"]
                raise ValueError("Problem!")
    except ValueError:
        pass
    assert exists(expected_dir) is True
    monkeypatch.setenv("WHERETOSTART", "iso3=SDN")
    result = list()
    for info, nextdict in progress_storing_tempdir(tempfolder, iterator, "iso3"):
        assert exists(info["folder"]) is True
        assert info["folder"] == expected_dir
        assert info["batch"] == start_batch
        result.append(nextdict)
    assert result == iterator[1:]
    assert exists(expected_dir) is False
    monkeypatch.delenv("WHERETOSTART")
    # 7. Fail once more so the folder and batch persist for the checks below
    try:
        for info, nextdict in progress_storing_tempdir(
                tempfolder, iterator, "iso3"):
            if nextdict["iso3"] == "YEM":
                start_batch = info["batch"]
                raise ValueError("Problem!")
    except ValueError:
        pass
    # 8. A WHERETOSTART value matching no item yields nothing and keeps folder/batch
    monkeypatch.setenv("WHERETOSTART", "iso3=NOTFOUND")
    found = False
    for _ in progress_storing_tempdir(tempfolder, iterator, "iso3"):
        found = True
    assert found is False
    assert exists(expected_dir) is True
    batch = load_file_to_str(expected_batch_file, strip=True)
    assert batch == start_batch
    monkeypatch.delenv("WHERETOSTART")
    # 9. A WHERETOSTART key not matching the iterator key behaves the same way
    monkeypatch.setenv("WHERETOSTART", "NOTFOUND=SDN")
    found = False
    for _ in progress_storing_tempdir(tempfolder, iterator, "iso3"):
        found = True
    assert found is False
    assert exists(expected_dir) is True
    batch = load_file_to_str(expected_batch_file, strip=True)
    assert batch == start_batch
    monkeypatch.delenv("WHERETOSTART")
    # Clean up the folder left behind by the deliberate failure above
    rmtree(expected_dir, ignore_errors=True)
def htmltext(self, fixturesfolder):
    """Fixture: read and return the html response fixture file as a string."""
    path = join(fixturesfolder, 'html', 'response.html')
    return load_file_to_str(path)
def get_session(**kwargs):
    # type: (Any) -> requests.Session
    """Set up and return Session object that is set up with retrying

    Args:
        **kwargs: See below
        auth (Tuple[str, str]): Authorisation information in tuple form (user, pass) OR
        basic_auth (str): Authorisation information in basic auth string form (Basic xxxxxxxxxxxxxxxx) OR
        basic_auth_file (str): Path to file containing authorisation information in basic auth string form (Basic xxxxxxxxxxxxxxxx)
        extra_params_dict (Dict): Extra parameters to put on end of url as a dictionary OR
        extra_params_json (str): Path to JSON file containing extra parameters to put on end of url OR
        extra_params_yaml (str): Path to YAML file containing extra parameters to put on end of url
        extra_params_lookup (str): Lookup key for parameters. If not given assumes parameters are at root of the dict.
        status_forcelist (iterable): HTTP statuses for which to force retry. Defaults to [429, 500, 502, 503, 504].
        method_whitelist (iterable): HTTP methods for which to force retry. Defaults to frozenset(['HEAD', 'TRACE', 'GET', 'PUT', 'OPTIONS', 'DELETE']).

    Returns:
        requests.Session: Session object set up with retrying

    Raises:
        SessionError: If more than one set of extra parameters or more than
            one authorisation is given, or the lookup key is missing.
    """
    s = requests.Session()
    extra_params_found = False
    extra_params_dict = kwargs.get('extra_params_dict', None)
    if extra_params_dict:
        extra_params_found = True
        logger.info('Loading extra parameters from dictionary')
    extra_params_json = kwargs.get('extra_params_json', '')
    if extra_params_json:
        if extra_params_found:
            raise SessionError('More than one set of extra parameters given!')
        extra_params_found = True
        logger.info('Loading extra parameters from: %s' % extra_params_json)
        extra_params_dict = load_json(extra_params_json)
    extra_params_yaml = kwargs.get('extra_params_yaml', '')
    if extra_params_found:
        if extra_params_yaml:
            raise SessionError('More than one set of extra parameters given!')
    else:
        if extra_params_yaml:
            logger.info('Loading extra parameters from: %s' % extra_params_yaml)
            extra_params_dict = load_yaml(extra_params_yaml)
        else:
            extra_params_dict = dict()
    extra_params_lookup = kwargs.get('extra_params_lookup')
    if extra_params_lookup:
        extra_params_dict = extra_params_dict.get(extra_params_lookup)
        if extra_params_dict is None:
            raise SessionError('%s does not exist in extra_params!'
                               % extra_params_lookup)
    auth_found = False
    # basic_auth embedded in extra params is authorisation, not a URL parameter
    basic_auth = extra_params_dict.get('basic_auth')
    if basic_auth:
        logger.info('Loading authorisation from basic_auth parameter')
        auth_found = True
        del extra_params_dict['basic_auth']
    s.params = extra_params_dict
    auth = kwargs.get('auth')
    if auth:
        if auth_found:
            raise SessionError('More than one authorisation given!')
        logger.info('Loading authorisation from auth argument')
        auth_found = True
    bauth = kwargs.get('basic_auth')
    if bauth:
        if auth_found:
            raise SessionError('More than one authorisation given!')
        logger.info('Loading authorisation from basic_auth argument')
        basic_auth = bauth
        auth_found = True
    basic_auth_file = kwargs.get('basic_auth_file')
    if basic_auth_file:
        if auth_found:
            raise SessionError('More than one authorisation given!')
        logger.info('Loading authorisation from: %s' % basic_auth_file)
        basic_auth = load_file_to_str(basic_auth_file)
    if basic_auth:
        auth = decode(basic_auth)
    s.auth = auth
    # Retry transient failures (rate limiting and 5xx) with exponential backoff
    status_forcelist = kwargs.get('status_forcelist', [429, 500, 502, 503, 504])
    method_whitelist = kwargs.get(
        'method_whitelist',
        frozenset(['HEAD', 'TRACE', 'GET', 'PUT', 'OPTIONS', 'DELETE']))
    retries = Retry(total=5, backoff_factor=0.4,
                    status_forcelist=status_forcelist,
                    method_whitelist=method_whitelist,
                    raise_on_redirect=True, raise_on_status=True)
    s.mount(
        'http://',
        HTTPAdapter(max_retries=retries, pool_connections=100, pool_maxsize=100))
    s.mount(
        'https://',
        HTTPAdapter(max_retries=retries, pool_connections=100, pool_maxsize=100))
    return s
def multiple_progress_storing_tempdir(
    folder: str,
    iterators: List[Iterable[Dict]],
    keys: List[str],
    batch: Optional[str] = None,
) -> Tuple[int, Dict, Dict]:
    """Store progress in temporary directory. The folder persists until the
    final iteration of the last iterator allowing which iteration to start at
    and the batch code to be persisted between runs. Yields 3 values: the
    index of the current iterator, an info dictionary and the next dictionary
    in the iterator. The info dictionary contains key folder which is the
    temporary directory optionally with folder appended (and created if it
    doesn't exist). In key progress is held the current position in the
    iterator. It also contains the key batch containing a batch code to be
    passed as the batch parameter in create_in_hdx or update_in_hdx calls. The
    WHERETOSTART environment variable can be set to RESET to force the
    deletion and recreation of the temporary directory or to a key value pair
    in the form key=value eg. iso3=PAK indicating where to start.

    Args:
        folder (str): Folder to create in temporary folder
        iterators (List[Iterable[Dict]]): Iterate over each iterator in the list consecutively persisting progress
        keys (List[str]): Key to examine from dictionary from each iterator in the above list
        batch (Optional[str]): Batch to use if there isn't one in a file already.

    Returns:
        Tuple[int, Dict, Dict]: A tuple of the form (iterator index, info dictionary, next object in iterator)
    """
    delete_if_exists = False
    wheretostartenv = getenv("WHERETOSTART")
    if wheretostartenv:
        if wheretostartenv.upper() == "RESET":
            delete_if_exists = True
            logger.info(
                "Removing progress file and will start from beginning!"
            )
    with temp_dir_batch(
        folder,
        delete_if_exists,
        delete_on_success=True,
        delete_on_failure=False,
        batch=batch,
    ) as info:
        tempdir = info["folder"]
        batch = info["batch"]
        # progress.txt lives at the top level and is shared across iterators,
        # so compute its path once rather than per iteration
        progress_file = join(tempdir, "progress.txt")
        for i, key in enumerate(keys):
            if wheretostartenv:
                wheretostart = get_wheretostart(
                    wheretostartenv, "Environment variable", key
                )
            else:
                if exists(progress_file):
                    contents = load_file_to_str(progress_file, strip=True)
                    wheretostart = get_wheretostart(contents, "File", key)
                else:
                    wheretostart = None
            # Each iterator gets its own numbered subfolder sharing the batch
            with temp_dir_batch(
                str(i),
                False,
                delete_on_success=True,
                delete_on_failure=False,
                batch=batch,
                tempdir=tempdir,
            ) as info:
                for info, nextdict in progress_storing_folder(
                    info, iterators[i], key, wheretostart
                ):
                    save_str_to_file(info["progress"], progress_file)
                    yield i, info, nextdict
            # Iterator i completed: clear its progress so iterator i+1 starts fresh
            if exists(progress_file):
                remove(progress_file)
def progress_storing_folder(
    info: Dict,
    iterator: Iterable[Dict],
    key: str,
    wheretostart: Optional[str] = None,
) -> Tuple[Dict, Dict]:
    """Store progress in folder in key folder of info dictionary parameter.
    Yields 2 dictionaries. The first is the info dictionary. It contains in
    key folder the folder being used to store progress and in key progress
    the current position in the iterator. The second dictionary is the next
    dictionary in the iterator.

    Args:
        info (Dict): Dictionary containing folder and anything else to be yielded
        iterator (Iterable[Dict]): Iterate over this object persisting progress
        key (str): Key to examine from dictionary from iterator
        wheretostart (Optional[str]): Where in iterator to start

    Returns:
        Tuple[Dict,Dict]: A tuple of the form (info dictionary, next object in iterator)

    Raises:
        NotFoundError: If wheretostart was given but never matched and no run started
    """
    folder = info["folder"]
    progress_file = join(folder, "progress.txt")
    if not wheretostart:
        # Fall back to the WHERETOSTART environment variable, then to any
        # progress file left by a previous run
        contents = getenv("WHERETOSTART")
        if contents:
            wheretostart = get_wheretostart(
                contents, "Environment variable", key
            )
        else:
            if exists(progress_file):
                contents = load_file_to_str(progress_file, strip=True)
                wheretostart = get_wheretostart(contents, "File", key)
            else:
                wheretostart = None
    found = False
    for nextdict in iterator:
        current = nextdict[key]
        if wheretostart:
            if wheretostart == "IGNORE":
                continue
            if not found:
                if current == wheretostart:
                    found = True
                    logger.info(
                        f"Starting run from WHERETOSTART {wheretostart}"
                    )
                else:
                    # Skip items until the requested starting point is reached
                    logger.info(
                        f"Run not started. Ignoring {current}. WHERETOSTART ({wheretostart}) not matched."
                    )
                    continue
        output = f"{key}={current}"
        info["progress"] = output
        # Persist position before yielding so a crash mid-item can be resumed
        save_str_to_file(output, progress_file)
        yield info, nextdict
    if wheretostart and not found:
        raise NotFoundError(
            f"WHERETOSTART ({wheretostart}) not matched in iterator with key {key} and no run started!"
        )