Example #1
def url_get(url):
    """
    Download an URL using a cache and return the response object
    :param url:
    :return:
    """

    s = None
    if 'file://' in url:
        s = requests.session()
        s.mount('file://', FileAdapter())
    elif 'dir://' in url:
        s = requests.session()
        s.mount('dir://', DirAdapter())
    else:
        retry = Retry(total=3, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        s = CachedSession(cache_name="pyff_cache",
                          backend=config.request_cache_backend,
                          expire_after=config.request_cache_time,
                          old_data_on_error=True)
        s.mount('http://', adapter)
        s.mount('https://', adapter)

    headers = {'User-Agent': "pyFF/{}".format(__version__), 'Accept': '*/*'}
    try:
        r = s.get(url, headers=headers, verify=False, timeout=config.request_timeout)
    except IOError:
        # Fall back to a plain, non-cached session if the cached request fails.
        s = requests.Session()
        r = s.get(url, headers=headers, verify=False, timeout=config.request_timeout)

    if six.PY2:
        r.encoding = "utf-8"

    log.debug("url_get({}) returns {} chrs encoded as {}".format(url, len(r.content), r.encoding))

    if config.request_override_encoding is not None:
        r.encoding = config.request_override_encoding

    return r
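
A minimal, self-contained sketch of the same pattern (a CachedSession with a retrying HTTPAdapter mounted on both schemes); the cache name, expiry, and URL below are placeholders rather than pyFF configuration:

from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from requests_cache import CachedSession

session = CachedSession(cache_name="demo_cache", backend="sqlite", expire_after=300)
retry = Retry(total=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount("http://", adapter)    # the retrying adapter handles both schemes
session.mount("https://", adapter)

response = session.get("https://example.com", timeout=10)
print(response.status_code, getattr(response, "from_cache", False))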
Example #2
    def routes(x):
        # Extract begin and end longitude and latitude of each geometry
        x0 = Point(x.coords[0]).x
        y0 = Point(x.coords[0]).y
        x1 = Point(x.coords[-1]).x
        y1 = Point(x.coords[-1]).y

        # Combine into query
        itinerarypoints = "".join([str(x0), ",", str(y0), "|", str(x1), ",", str(y1)])
        url = base_url
        url += "&itinerarypoints=%s" % itinerarypoints

        # Retrieve result
        session = CachedSession('cyclestreets_cache', expire_after=expire)
        retry = Retry(connect=3, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        req = session.get(url)
        jsn = req.json()

        # Convert json into coordinates
        if "marker" in jsn:
            coordinates = jsn["marker"]["@attributes"]["coordinates"]
            coordinates = re.findall(r'[^,\s]+', coordinates)
            coordinates = list(map(float, coordinates))
            elem = iter(coordinates)
            coordinates = [*zip(elem, elem)]
            return LineString(coordinates)
        else:
            if x0 == x1:
                return LineString([(x0, y0), (x1, y1)])
            else:
                warnings.warn(
                    "No route found. Please relocate one of the centroids closer "
                    "to a road using the corr_cent function in the geo module. "
                    "The tazce codes of the centroids can be found with the "
                    "find_cent function in this module.",
                    Warning, stacklevel=2)
                return Point(x0, y0)
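
The coordinate parsing above can be exercised on its own; the sketch below feeds a made-up "x,y x,y ..." string (an assumption about the CycleStreets response format, based on the code above) into shapely:

import re
from shapely.geometry import LineString

coords = "0.117,52.205 0.121,52.208 0.125,52.210"      # hypothetical coordinate string
values = list(map(float, re.findall(r'[^,\s]+', coords)))
pairs = list(zip(values[::2], values[1::2]))            # same pairing as the iter()/zip() trick above
line = LineString(pairs)
print(line.length)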
Example #3
def get_web_session(cache_storage=None):
    """
    Return a requests session configured with retries, proxies taken from the
    HTTP_PROXY/HTTPS_PROXY environment variables, and an optional cache backend.
    """
    proxies = None

    http_proxy = os.getenv('HTTP_PROXY')
    https_proxy = os.getenv('HTTPS_PROXY')

    if http_proxy is not None and https_proxy is not None:
        proxies = {'http': http_proxy, 'https': https_proxy}

    if cache_storage is not None:

        o = urlparse(cache_storage)
        if o.scheme == "mongodb":
            # these requests-cache internals gymnastics are necessary
            # because it will not create a database with the desired name otherwise
            dbname = o.path.replace('/', '')
            dbconn = MongoClient(cache_storage)
            session = CachedSession(backend='mongodb')
            session.cache = MongoCache(connection=dbconn, db_name=dbname)
        else:
            session = CachedSession(cache_name=cache_storage, extension='')
    else:
        session = Session()

    retry = Retry(total=10,
                  read=10,
                  connect=10,
                  backoff_factor=0.3,
                  status_forcelist=(500, 502, 504))
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)

    session.proxies = proxies
    session.headers.update({'User-Agent': __useragent__})

    return session
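
Hedged usage sketch for get_web_session; the cache file name and MongoDB URL are made up, and the mongodb branch assumes a reachable server whose database name is taken from the URI path:

plain = get_web_session()                                   # no caching, retries only
file_cached = get_web_session(cache_storage='web_cache')    # file-backed cache
mongo = get_web_session(cache_storage='mongodb://localhost:27017/pages')
resp = file_cached.get('https://example.com')
print(resp.status_code)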
Example #4
def mock_session() -> CachedSession:
    """Fixture for combining requests-cache with requests-mock. This will behave the same as a
    CachedSession, except it will make mock requests for ``mock://`` URLs, if it hasn't been cached
    already.

    For example, ``mock_session.get(MOCKED_URL)`` will return a mock response on the first call,
    and a cached mock response on the second call. Additional mock responses can be added via
    ``mock_session.mock_adapter.register_uri()``.

    This uses a temporary SQLite db stored in ``/tmp``, which will be removed after the fixture has
    exited.
    """
    with NamedTemporaryFile(suffix='.db') as temp:
        session = CachedSession(
            cache_name=temp.name,
            backend='sqlite',
            allowable_methods=ALL_METHODS,
            suppress_warnings=True,
        )
        adapter = get_mock_adapter()
        for protocol in MOCK_PROTOCOLS:
            session.mount(protocol, adapter)
        session.mock_adapter = adapter
        yield session
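
A sketch of how the fixture might be used in a test, following the behaviour described in the docstring; MOCKED_URL and the responses registered by get_mock_adapter() come from the surrounding test helpers and are assumed here:

def test_mock_session_caches_mock_responses(mock_session):
    first = mock_session.get(MOCKED_URL)     # served by the mock adapter
    second = mock_session.get(MOCKED_URL)    # served from the temporary SQLite cache
    assert not getattr(first, 'from_cache', False)
    assert second.from_cache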
Example #5
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

HEADER = ['url', 'link', 'title', 'description', 'content', 'topics', 'organisations']

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('input_file', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
parser.add_argument('--environment', '-e', dest='root_url', default='https://www-origin.staging.publishing.service.gov.uk', help='the environment used to query the search API')
parser.add_argument('--skip', '-s', dest='skip', type=int, default=0, help='Number of input rows to skip. Can be used to resume a partially completed import')
parser.add_argument('--skip-redirects', '-r', dest='skip_redirects', action='store_true', help="Don't test URLs on GOV.UK to resolve redirected links.")
parser.add_argument('--wait-time', '-w', dest='wait_time', type=float, default=0.1, help='Time to wait between each link, to work around rate limiting.')
args = parser.parse_args()

session = CachedSession(cache_name='govuk_cache', backend='sqlite')
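# Retry rate-limited (HTTP 429) responses up to 5 times, backing off by wait_time;
# urllib3 also honours a Retry-After header for 429 responses by default.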
retries = Retry(total=5, backoff_factor=args.wait_time, status_forcelist=[429])
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))


def test_base_path(original_base_path, args):
    """
    Given a base path, try to classify it as valid, redirected, or gone,
    so that we can fetch data even when the link has been redirected.

    If it can't be retrieved, return None; otherwise return the ultimate base path.

    We might include the same document multiple times in our analysis, but
    this should only happen for a small number of links, and we can strip
    out duplicates later.

    You can pass the --skip-redirects flag on the command line to skip this step,
Example #6
from .exceptions import UnknownLatestVersion

# Number of characters to strip from a file:// URI to recover a filesystem path:
# 8 on Windows ('file:///C:/...' -> 'C:/...'), 7 elsewhere so the leading slash is kept.
if os.name == 'nt':
    file_uri_offset = 8
else:
    file_uri_offset = 7

default_minor_version = '1.1'

# https://2.python-requests.org/projects/3/api/#requests.adapters.HTTPAdapter
# https://urllib3.readthedocs.io/en/latest/advanced-usage.html#customizing-pool-behavior
adapter = HTTPAdapter(max_retries=3,
                      pool_maxsize=int(os.getenv('REQUESTS_POOL_MAXSIZE', 10)))
session = CachedSession(backend='memory')
session.mount('https://', adapter)
session.mount('http://', adapter)


def json_dump(data, io):
    """
    Dumps JSON to a file-like object.
    """
    json.dump(data, io, ensure_ascii=False, indent=2)


def get_latest_version(versions):
    """
    Returns the identifier of the latest version from a list of versions of the same extension.

    :raises UnknownLatestVersion: if the latest version of the extension can't be determined
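
A quick sketch exercising json_dump from this example; the io.StringIO buffer and the payload are illustrative only:

import io

buf = io.StringIO()
json_dump({'name': 'café', 'count': 1}, buf)
print(buf.getvalue())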
Example #7
import csv
import sys
import ipdb
import json
import time
import argparse
import requests
import requests_cache
from urlparse import urlparse
from BeautifulSoup import BeautifulSoup
from requests_cache import CachedSession
from requests.adapters import HTTPAdapter


session = CachedSession(cache_name='govuk_cache', backend='sqlite')
session.mount('http://', HTTPAdapter())
session.mount('https://', HTTPAdapter())


def test_base_path(original_url):
    """
    Given a URL, perform a HEAD request, follow any redirects, and return the
    final URL.
    """
    response = session.head(original_url, allow_redirects=True)

    if 200 <= response.status_code < 300:
        return response.url
    elif response.status_code == 429:
        response.raise_for_status()
    else:
Example #8
    '-r',
    dest='skip_redirects',
    action='store_true',
    help="Don't test URLs on GOV.UK to resolve redirected links.")
parser.add_argument(
    '--wait-time',
    '-w',
    dest='wait_time',
    type=float,
    default=0.1,
    help='Time to wait between each link, to work around rate limiting.')
args = parser.parse_args()

session = CachedSession(cache_name='govuk_cache', backend='sqlite')
retries = Retry(total=5, backoff_factor=args.wait_time, status_forcelist=[429])
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))


def test_base_path(original_base_path, args):
    """
    Given a base path, try to classify it as valid, redirected, or gone,
    so that we can fetch data even when the link has been redirected.

    If it can't be retrieved, return None; otherwise return the ultimate base path.

    We might include the same document multiple times in our analysis, but
    this should only happen for a small number of links, and we can strip
    out duplicates later.

    You can pass the --skip-redirects flag on the command line to skip this step,