def url_get(url):
    """Download *url* through a (possibly cached) session and return the response.

    Scheme dispatch: ``file://`` and ``dir://`` URLs are served by their
    dedicated adapters; everything else goes through a retrying, caching
    HTTP session. If the cached fetch raises IOError, the fetch is retried
    once with a plain uncached session.

    :param url: the URL to fetch
    :return: a requests Response object
    """
    headers = {'User-Agent': "pyFF/{}".format(__version__), 'Accept': '*/*'}

    # Fix: use startswith() rather than substring containment ('file://' in url)
    # so a URL that merely *mentions* another scheme is not misclassified.
    if url.startswith('file://'):
        s = requests.session()
        s.mount('file://', FileAdapter())
    elif url.startswith('dir://'):
        s = requests.session()
        s.mount('dir://', DirAdapter())
    else:
        retry = Retry(total=3, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        s = CachedSession(cache_name="pyff_cache",
                          backend=config.request_cache_backend,
                          expire_after=config.request_cache_time,
                          old_data_on_error=True)
        s.mount('http://', adapter)
        s.mount('https://', adapter)

    try:
        # NOTE(review): verify=False disables TLS certificate validation;
        # presumably content integrity is verified elsewhere (e.g. XML
        # signatures) — confirm before relying on this for trust.
        r = s.get(url, headers=headers, verify=False, timeout=config.request_timeout)
    except IOError:
        # The cache backend can fail (e.g. corrupt cache file); retry once
        # with a plain, uncached session before giving up.
        s = requests.Session()
        r = s.get(url, headers=headers, verify=False, timeout=config.request_timeout)

    if six.PY2:
        r.encoding = "utf-8"

    log.debug("url_get({}) returns {} chrs encoded as {}".format(url, len(r.content), r.encoding))

    if config.request_override_encoding is not None:
        r.encoding = config.request_override_encoding

    return r
def routes(x):
    """Query the CycleStreets journey API for a route along geometry *x*.

    The first and last coordinates of *x* are used as the start and end
    of the itinerary. Returns a LineString of the routed path; if the API
    returns no route, returns a degenerate LineString when start and end
    coincide, otherwise warns and returns the start Point.
    """
    # Start / end coordinates of the input geometry.
    start = Point(x.coords[0])
    end = Point(x.coords[-1])
    x0, y0 = start.x, start.y
    x1, y1 = end.x, end.y

    # Assemble the API query URL from the two itinerary points.
    itinerarypoints = "{},{}|{},{}".format(x0, y0, x1, y1)
    url = base_url + "&itinerarypoints=%s" % itinerarypoints

    # Cached session with retries on connection failures.
    session = CachedSession('cyclestreets_cache', expire_after=expire)
    adapter = HTTPAdapter(max_retries=Retry(connect=3, backoff_factor=0.5))
    for scheme in ('http://', 'https://'):
        session.mount(scheme, adapter)

    jsn = session.get(url).json()

    if "marker" not in jsn:
        # No routable result came back from the API.
        if x0 == x1:
            return LineString([(x0, y0), (x1, y1)])
        warnings.warn("No route found. Please relocate one of the centroids closer to a road using the corr_cent function in the geo module. The tazce codes of the centroids can be found with find_cent function in this module.", Warning, stacklevel=2)
        return Point(x0, y0)

    # The "coordinates" attribute is a flat "lon,lat lon,lat ..." string;
    # split it into floats and pair them up into (lon, lat) tuples.
    raw = jsn["marker"]["@attributes"]["coordinates"]
    values = [float(tok) for tok in re.findall(r'[^,\s]+', raw)]
    it = iter(values)
    return LineString(list(zip(it, it)))
def get_web_session(cache_storage=None):
    """Build a requests session with retries, proxies and an optional cache.

    :param cache_storage: ``None`` for a plain uncached Session; a
        ``mongodb://`` URL for a MongoDB-backed cache; any other string is
        used as the cache file name of a file-backed CachedSession.
    :return: a configured Session / CachedSession
    """
    # Fix: honour each proxy environment variable independently.
    # Previously BOTH HTTP_PROXY and HTTPS_PROXY had to be set for either
    # to take effect, and session.proxies was clobbered with None otherwise.
    proxies = {}
    http_proxy = os.getenv('HTTP_PROXY')
    https_proxy = os.getenv('HTTPS_PROXY')
    if http_proxy is not None:
        proxies['http'] = http_proxy
    if https_proxy is not None:
        proxies['https'] = https_proxy

    if cache_storage is not None:
        o = urlparse(cache_storage)
        if o.scheme == "mongodb":
            # these requests-cache internals gymnastics are necessary
            # because it will not create a database with the desired name otherwise
            dbname = o.path.replace('/', '')
            dbconn = MongoClient(cache_storage)
            session = CachedSession(backend='mongodb')
            session.cache = MongoCache(connection=dbconn, db_name=dbname)
        else:
            session = CachedSession(cache_name=cache_storage, extension='')
    else:
        session = Session()

    retry = Retry(total=10, read=10, connect=10, backoff_factor=0.3,
                  status_forcelist=(500, 502, 504))
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    # Only override the default proxy mapping when something was configured.
    if proxies:
        session.proxies = proxies
    session.headers.update({'User-Agent': __useragent__})
    return session
def mock_session() -> CachedSession:
    """Fixture combining requests-cache with requests-mock.

    Behaves like a CachedSession, except that ``mock://`` URLs are served
    by a mock adapter when not already cached: ``mock_session.get(MOCKED_URL)``
    returns a mock response on the first call and a cached mock response
    afterwards. Register additional mock responses via
    ``mock_session.mock_adapter.register_uri()``.

    Backed by a temporary SQLite db in ``/tmp`` that is removed once the
    fixture exits.
    """
    with NamedTemporaryFile(suffix='.db') as temp:
        cached = CachedSession(
            backend='sqlite',
            cache_name=temp.name,
            suppress_warnings=True,
            allowable_methods=ALL_METHODS,
        )
        mock_adapter = get_mock_adapter()
        for scheme in MOCK_PROTOCOLS:
            cached.mount(scheme, mock_adapter)
        cached.mock_adapter = mock_adapter
        yield cached
from requests.packages.urllib3.util.retry import Retry from requests.adapters import HTTPAdapter HEADER = ['url', 'link', 'title', 'description', 'content', 'topics', 'organisations'] parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('input_file', nargs='?', type=argparse.FileType('r'), default=sys.stdin) parser.add_argument('--environment', '-e', dest='root_url', default='https://www-origin.staging.publishing.service.gov.uk', help='the environment used to query the search API') parser.add_argument('--skip', '-s', dest='skip', type=int, default=0, help='Number of input rows to skip. Can be used to resume a partially completed import') parser.add_argument('--skip-redirects', '-r', dest='skip_redirects', action='store_true', help="Don't test URLs on GOV.UK to resolve redirected links.") parser.add_argument('--wait-time', '-w', dest='wait_time', type=float, default=0.1, help='Time to wait between each link, to work around rate limiting.') args = parser.parse_args() session = CachedSession(cache_name='govuk_cache', backend='sqlite') retries = Retry(total=5, backoff_factor=args.wait_time, status_forcelist=[ 429 ]) session.mount('http://', HTTPAdapter(max_retries=retries)) session.mount('https://', HTTPAdapter(max_retries=retries)) def test_base_path(original_base_path, args): """ Given a base path, try and classify it as valid, redirected, or gone, so that we can fetch data even when the link has been redirected. If it can't be retrieved, return None, otherwise return the ultimate base path. We might include the same document multiple times in our analysis, but this should only happen for a small amount of links and we can strip out duplicates later. You can pass --skip_redirects flag on the commmand line, to skip this step,
from .exceptions import UnknownLatestVersion if os.name == 'nt': file_uri_offset = 8 else: file_uri_offset = 7 default_minor_version = '1.1' # https://2.python-requests.org/projects/3/api/#requests.adapters.HTTPAdapter # https://urllib3.readthedocs.io/en/latest/advanced-usage.html#customizing-pool-behavior adapter = HTTPAdapter(max_retries=3, pool_maxsize=int(os.getenv('REQUESTS_POOL_MAXSIZE', 10))) session = CachedSession(backend='memory') session.mount('https://', adapter) session.mount('http://', adapter) def json_dump(data, io): """ Dumps JSON to a file-like object. """ json.dump(data, io, ensure_ascii=False, indent=2) def get_latest_version(versions): """ Returns the identifier of the latest version from a list of versions of the same extension. :raises UnknownLatestVersion: if the latest version of the extension can't be determined
import csv import sys import ipdb import json import time import argparse import requests import requests_cache from urlparse import urlparse from BeautifulSoup import BeautifulSoup from requests_cache import CachedSession from requests.adapters import HTTPAdapter session = CachedSession(cache_name='govuk_cache', backend='sqlite') session.mount('http://', HTTPAdapter()) session.mount('https://', HTTPAdapter()) def test_base_path(original_url): """ Given a URL, perform a HEAD request and fetch the actual URL (in case there are redirects) and return that. """ response = session.head(original_url, allow_redirects=True) if 200 <= response.status_code < 300: return response.url elif response.status_code == 429: response.raise_for_status() else:
'-r', dest='skip_redirects', action='store_true', help="Don't test URLs on GOV.UK to resolve redirected links.") parser.add_argument( '--wait-time', '-w', dest='wait_time', type=float, default=0.1, help='Time to wait between each link, to work around rate limiting.') args = parser.parse_args() session = CachedSession(cache_name='govuk_cache', backend='sqlite') retries = Retry(total=5, backoff_factor=args.wait_time, status_forcelist=[429]) session.mount('http://', HTTPAdapter(max_retries=retries)) session.mount('https://', HTTPAdapter(max_retries=retries)) def test_base_path(original_base_path, args): """ Given a base path, try and classify it as valid, redirected, or gone, so that we can fetch data even when the link has been redirected. If it can't be retrieved, return None, otherwise return the ultimate base path. We might include the same document multiple times in our analysis, but this should only happen for a small amount of links and we can strip out duplicates later. You can pass --skip_redirects flag on the commmand line, to skip this step,