Example #1
    def __init__(self,
                 url="https://pypi.org/",
                 disable_cache=False,
                 fallback=True):
        super(PyPiRepository, self).__init__(url.rstrip("/") + "/simple/")

        self._base_url = url
        self._disable_cache = disable_cache
        self._fallback = fallback

        release_cache_dir = REPOSITORY_CACHE_DIR / "pypi"
        self._cache = CacheManager({
            "default": "releases",
            "serializer": "json",
            "stores": {
                "releases": {
                    "driver": "file",
                    "path": str(release_cache_dir)
                },
                "packages": {
                    "driver": "dict"
                },
            },
        })

        self._cache_control_cache = FileCache(str(release_cache_dir / "_http"))
        self._session = CacheControl(requests.session(),
                                     cache=self._cache_control_cache)

        self._name = "PyPI"
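Most of the examples on this page reduce to the same core pattern: wrap a requests session in CacheControl with a file-backed cache. A minimal standalone sketch of that pattern (the cache path is an arbitrary illustration, not taken from any example here):

import requests
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache

# Wrap a plain session; responses are then written to disk and revalidated
# according to their HTTP caching headers.
session = CacheControl(requests.Session(), cache=FileCache("/tmp/http-cache"))

response = session.get("https://pypi.org/simple/")
# A repeated GET can be answered from the file cache while the cached
# entry is still fresh.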
Example #2
    def __init__(self,
                 url="https://pypi.org/",
                 disable_cache=False,
                 fallback=True):
        self._url = url
        self._disable_cache = disable_cache
        self._fallback = fallback

        release_cache_dir = Path(CACHE_DIR) / "cache" / "repositories" / "pypi"
        self._cache = CacheManager({
            "default": "releases",
            "serializer": "json",
            "stores": {
                "releases": {
                    "driver": "file",
                    "path": str(release_cache_dir)
                },
                "packages": {
                    "driver": "dict"
                },
            },
        })

        self._cache_control_cache = FileCache(str(release_cache_dir / "_http"))
        self._session = CacheControl(session(),
                                     cache=self._cache_control_cache)
        self._inspector = Inspector()

        super(PyPiRepository, self).__init__()

        self._name = "PyPI"
Example #3
def setup_sessions():
    # as CacheControl patches the session object, we can't share
    # one between both
    feed_sess = CacheControl(requests.Session(),
                             cache=FileCache(args.cache + '/feed'))
    #article_sess = CacheControl(session,
    #    cache=FileCache(args.cache + '/forever', forever=True))
    article_sess = CacheControl(
        requests.Session(),
        cache=FileCache(args.cache + '/article'),
        heuristic=cachecontrol.heuristics.ExpiresAfter(days=2 * 365))
    return (feed_sess, article_sess)
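Note the design constraint the comment points at: CacheControl works by patching the session it is given, so the feed and article fetchers each need their own requests.Session rather than sharing one. The commented-out lines preserve an earlier variant that cached articles with FileCache(forever=True); the active code instead bounds article caching at two years via the ExpiresAfter heuristic.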
Example #4
class Offers:
    cache_dir = XDG_CACHE_HOME / 'price-ec2' / 'http'
    session = CacheControl(requests.Session(), cache=FileCache(cache_dir))

    def prices(self, service, region=None):
        def fetch_json(url):
            with progress('fetching ' + url):
                response = self.session.get('https://pricing.us-east-1.amazonaws.com' + url)
                response.raise_for_status()
                return response.json()

        index_url = '/offers/v1.0/aws/index.json'
        index = fetch_json(index_url)
        if region:
            region_index_url = index['offers'][service]['currentRegionIndexUrl']
            region_index = fetch_json(region_index_url)
            region_url = region_index['regions'][region]['currentVersionUrl']
            return fetch_json(region_url)
        else:
            current_url = index['offers'][service]['currentVersionUrl']
            return fetch_json(current_url)

    @lru_cache()
    def ec2(self, region):
        return self.prices('AmazonEC2', region)

    @lru_cache()
    def rds(self, region):
        return self.prices('AmazonRDS', region)

    @lru_cache()
    def elasticache(self, region):
        return self.prices('AmazonElastiCache', region)
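A hypothetical usage sketch for the class above (the region name and the 'products' key are assumptions about the AWS pricing documents, not shown in the example):

offers = Offers()
ec2 = offers.ec2('us-east-1')  # HTTP responses cached on disk; result memoized by lru_cache
print(len(ec2.get('products', {})))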
Example #5
def download(workdir, url):
    """Download a file, using .cache inside workdir as an HTTP cache."""
    logging.debug(u"initializing requests and cache-control")
    session = CacheControl(requests.Session(),
                           cache=FileCache(os.path.join(workdir, '.cache')))
    session.mount('file://', LocalFileAdapter())
    req = session.get(url, stream=True)
    downloaded_file = tempfile.TemporaryFile()
    try:
        size = 0
        start = datetime.datetime.now()
        for chunk in req.iter_content(chunk_size=1024000):
            if chunk:
                sys.stdout.write('.')
                sys.stdout.flush()
                downloaded_file.write(chunk)
                size += len(chunk)
        # print newline
        print()
        downloaded_file.flush()
        logging.info(u"downloaded {} - {} o. in {} s.", url, size,
                     (datetime.datetime.now() - start).total_seconds())
        logging.debug(u"reset file pointer - seek(0)")
        downloaded_file.seek(0)
        return downloaded_file
    except Exception:
        logging.debug(u"error on download, closing and deleting file")
        downloaded_file.close()
        raise
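An illustrative call (path and URL are placeholders; the caller owns the returned temporary file and should close it):

f = download('/tmp/workdir', 'https://example.com/archive.tar.gz')
try:
    data = f.read()
finally:
    f.close()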
Example #6
    def __init__(self):
        super(ProjectHolder, self).__init__()

        app_name = __name__.split('.')[0]

        self.cache_dir = user_cache_dir(app_name)
        log.info("Using cache directory: {}.".format(self.cache_dir))
        self.cache = FileCache(self.cache_dir)
        cache_adapter = CacheControlAdapter(cache=self.cache)
        self.mount("http://", cache_adapter)
        self.mount("https://", cache_adapter)

        self.headers.update(
            {'User-Agent': '{}/{}'.format(app_name, __version__)})
        log.info('Created instance of {}'.format(type(self).__name__))
        self.branches = None
        self.only = None
        self.exclude = None
        self.having_asset = None
        self.hostname = None
        # identifies project on a given hostname
        self.repo = None
        # short name for "repo", useful in URLs
        self.name = None
        # in some case we do not specify repo, but feed is discovered, no repo is given then
        self.feed_url = None
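Examples #6, #16, and #22 use the adapter form of the same idea: instead of wrapping the whole session, a CacheControlAdapter is mounted onto it per URL scheme. A minimal sketch of that variant (cache path illustrative):

import requests
from cachecontrol import CacheControlAdapter
from cachecontrol.caches.file_cache import FileCache

session = requests.Session()
adapter = CacheControlAdapter(cache=FileCache("/tmp/http-cache"))
# Mount for both schemes so HTTPS traffic is cached too.
session.mount("http://", adapter)
session.mount("https://", adapter)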
Example #7
def all_sites(sitemap_url='http://library.link/harvest/sitemap.xml'):
    '''
    >>> from librarylink.util import all_sites
    >>> [ s.host for s in all_sites() if 'denverlibrary' in s.host ]
    ['link.denverlibrary.org']
    '''
    #FIXME: Avoid accumulating all the nodes, which will require improvements to xml.treesequence
    @coroutine
    def sink(accumulator):
        while True:
            e = yield
            loc = next(select_name(e, 'loc'))
            lastmod = next(select_name(e, 'lastmod'))
            s = liblink_site()
            s.sitemap = loc.xml_value
            s.url, _, tail = s.sitemap.partition('harvest/sitemap.xml')
            s.base_url = s.url #Legacy property name
            #Early warning for funky URLs breaking stuff downstream
            assert not tail
            protocol, s.host, path, query, fragment = iri.split_uri_ref(s.sitemap)
            s.lastmod = lastmod.xml_value
            accumulator.append(s)

    nodes = []
    ts = xml.treesequence(('sitemapindex', 'sitemap'), sink(nodes))
    if hasattr(all_sites, 'cachedir'):
        sess = CacheControl(requests.Session(), cache=FileCache(all_sites.cachedir))
    else:
        sess = CacheControl(requests.Session())
    result = sess.get(sitemap_url)
    ts.parse(result.text)
    yield from nodes
Example #8
def center_iterator(client=None) -> Iterator[Dict]:
    if not PLATFORM_ENABLED:
        logger.warning(
            f"{PLATFORM.capitalize()} scrap is disabled in configuration file."
        )
        return []

    session = CacheControl(requests.Session(), cache=FileCache("./cache"))

    if client:
        session = client
    try:
        url = f'{get_config().get("base_urls").get("github_public_path")}{get_conf_outputs().get("centers_json_path").format(PLATFORM)}'
        response = session.get(url)
        # If the session didn't come from the unit tests
        if not client:
            if response.from_cache:
                logger.info(
                    f"Centre list for {PLATFORM} was served from the cache")
            else:
                logger.info(
                    f"Centre list for {PLATFORM} was a real request")

        data = response.json()
        logger.info(
            f"Found {len(data)} {PLATFORM.capitalize()} centers (external scraper)."
        )
        for center in data:
            yield center
    except Exception as e:
        logger.warning(f"Unable to scrape {PLATFORM} centers: {e}")
Example #9
    def __init__(self, name, url):
        if name == "pypi":
            raise ValueError("The name [pypi] is reserved for repositories")

        self._packages = []
        self._name = name
        self._url = url.rstrip("/")
        self._cache_dir = Path(CACHE_DIR) / "cache" / "repositories" / name

        self._cache = CacheManager({
            "default": "releases",
            "serializer": "json",
            "stores": {
                "releases": {
                    "driver": "file",
                    "path": str(self._cache_dir)
                },
                "packages": {
                    "driver": "dict"
                },
                "matches": {
                    "driver": "dict"
                },
            },
        })

        self._session = CacheControl(requests.session(),
                                     cache=FileCache(
                                         str(self._cache_dir / "_http")))
Example #10
    def __init__(
        self, name, url, auth=None, disable_cache=False
    ):  # type: (str, str, Optional[Auth], bool) -> None
        if name == "pypi":
            raise ValueError("The name [pypi] is reserved for repositories")

        self._packages = []
        self._name = name
        self._url = url.rstrip("/")
        self._cache_dir = Path(CACHE_DIR) / "cache" / "repositories" / name

        self._cache = CacheManager(
            {
                "default": "releases",
                "serializer": "json",
                "stores": {
                    "releases": {"driver": "file", "path": str(self._cache_dir)},
                    "packages": {"driver": "dict"},
                    "matches": {"driver": "dict"},
                },
            }
        )

        self._session = CacheControl(
            requests.session(), cache=FileCache(str(self._cache_dir / "_http"))
        )

        url_parts = urlparse.urlparse(self._url)
        if not url_parts.username and auth:
            self._session.auth = auth

        self._disable_cache = disable_cache
Example #11
def ec2_catalog():
    import requests
    from cachecontrol import CacheControl
    from cachecontrol.caches.file_cache import FileCache

    import logging
    logger = logging.getLogger('isitfit')
    logger.debug("Downloading ec2 catalog (cached to local file)")

    # based on URL = 'http://www.ec2instances.info/instances.json'
    # URL = 's3://...csv'
    # Edit 2019-09-10 use CDN link instead of direct gitlab link
    # URL = 'https://gitlab.com/autofitcloud/www.ec2instances.info-ec2op/raw/master/www.ec2instances.info/t3b_smaller_familyL2.json'
    URL = 'https://cdn.jsdelivr.net/gh/autofitcloud/www.ec2instances.info-ec2op/www.ec2instances.info/t3b_smaller_familyL2.json'

    # cached https://cachecontrol.readthedocs.io/en/latest/
    sess = requests.session()
    cached_sess = CacheControl(sess,
                               cache=FileCache('/tmp/isitfit_ec2info.cache'))
    r = cached_sess.request('get', URL)

    # read catalog, copy from ec2op-cli/ec2op/optimizer/cwDailyMaxMaxCpu
    import json
    j = json.dumps(r.json(), indent=4, sort_keys=True)
    from pandas import read_json
    df = read_json(j, orient='split')

    # Edit 2019-09-13 no need to subsample the columns at this stage
    # df = df[['API Name', 'Linux On Demand cost']]

    df = df.rename(columns={'Linux On Demand cost': 'cost_hourly'})
    # df = df.set_index('API Name') # need to use merge, not index
    return df
Example #12
    def __init__(self,
                 url='https://pypi.org/',
                 disable_cache=False,
                 fallback=True):
        self._name = 'PyPI'
        self._url = url
        self._disable_cache = disable_cache
        self._fallback = fallback

        release_cache_dir = Path(CACHE_DIR) / 'cache' / 'repositories' / 'pypi'
        self._cache = CacheManager({
            'default': 'releases',
            'serializer': 'json',
            'stores': {
                'releases': {
                    'driver': 'file',
                    'path': str(release_cache_dir)
                },
                'packages': {
                    'driver': 'dict'
                }
            }
        })

        self._session = CacheControl(
            session(),
            cache=FileCache(str(release_cache_dir / '_http'))
        )
        
        super(PyPiRepository, self).__init__()
Example #13
File: gh.py Project: jayvdb/gh
def api_call(endpoint, method, field_name=None):
    endpoint = endpoint.lstrip('/')
    headers = {}
    cache_dir = user_cache_dir("gh")
    log.info("Using cache directory: {}.".format(cache_dir))

    api_token = os.getenv("GITHUB_API_TOKEN")
    if api_token:
        log.info("Using API token")
        headers['Authorization'] = "token {}".format(api_token)

    with CacheControl(requests.Session(), cache=FileCache(cache_dir)) as s:
        s.headers.update(headers)

        if method == 'GET':
            r = s.get('https://api.github.com/{}'.format(endpoint),
                      headers=headers)
            rj = r.json()
            if r.status_code != 200:
                eprint(json.dumps(rj))
                exit(22)
            if field_name:
                if field_name in rj:
                    return (rj[field_name])
                else:
                    exit(23)
            else:
                return json.dumps(r.json())

Example #14
    def __init__(self, name, url):
        if name == 'pypi':
            raise ValueError('The name [pypi] is reserved for repositories')

        self._packages = []
        self._name = name
        self._url = url.rstrip('/')
        self._cache_dir = Path(CACHE_DIR) / 'cache' / 'repositories' / name

        self._cache = CacheManager({
            'default': 'releases',
            'serializer': 'json',
            'stores': {
                'releases': {
                    'driver': 'file',
                    'path': str(self._cache_dir)
                },
                'packages': {
                    'driver': 'dict'
                },
                'matches': {
                    'driver': 'dict'
                }
            }
        })

        self._session = CacheControl(requests.session(),
                                     cache=FileCache(
                                         str(self._cache_dir / '_http')))
Example #15
    def __init__(self, name, url, disable_cache=False):
        if name == "pypi":
            raise ValueError("The name [pypi] is reserved for repositories")

        self._packages = []
        self._name = name
        self._url = url.rstrip("/")
        self._cache_dir = Path(CACHE_DIR) / "cache" / "repositories" / name

        self._cache = CacheManager(
            {
                "default": "releases",
                "serializer": "json",
                "stores": {
                    "releases": {"driver": "file", "path": str(self._cache_dir)},
                    "packages": {"driver": "dict"},
                    "matches": {"driver": "dict"},
                },
            }
        )

        self._session = CacheControl(
            requests.session(), cache=FileCache(str(self._cache_dir / "_http"))
        )

        url_parts = urlparse.urlparse(self._url)
        if not url_parts.username:
            self._session.auth = get_http_basic_auth(
                Config.create("auth.toml"), self.name
            )

        self._disable_cache = disable_cache
Example #16
 def __init__(self, filename=""):
     super(BabelNet, self).__init__()
     if filename == "":
         filename = "babelnet_cache"
     self.mount('https://', CacheControlAdapter(cache=FileCache(filename)))
     self.headers.update({'Accept-Encoding': 'gzip'})
     self.params.update({'key': cfg.babelnet_key})
     self.endpoint = "https://babelnet.io/v4/"
Example #17
def get_stored_session():
    global __STORED_SESSION
    create_directory_tree(CACHE_DIR)

    if __STORED_SESSION is None:
        __STORED_SESSION = CacheControl(requests.Session(),
                                        cache=FileCache(CACHE_DIR))
    return __STORED_SESSION
Example #18
 def __init__(self, config={}, cache=None):
     self.config = config
     if cache is None:
         # sticky local cache directory for testing
         cache = FileCache(".cache", forever=True)
     self.session = CacheControl(requests.Session(),
                                 cache=cache,
                                 heuristic=ExpiresAfter(days=30))
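The ExpiresAfter heuristic treats every response as fresh for 30 days regardless of the server's caching headers, and, as the comment suggests, FileCache(forever=True) keeps entries on disk rather than deleting them when they are invalidated; together they make the cache "sticky" enough for repeated test runs.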
Example #19
    def __init__(self,
                 uri: str = None,
                 session: requests.Session = None,
                 seed: str = None):

        # Airtable and gssutils are using slightly different field names....
        self.meta_field_mapping = {"published": "issued"}

        # Add an explicit on/off for temp scraping (based on presence of dataURL)
        self.temp_scrape = False

        # Use seed if provided
        if seed is not None:
            with open(seed, "r") as f:
                self.seed = json.load(f)
                if "dataURL" in self.seed:
                    logging.warning(
                        "A temporary dataURL has been specified; proceeding with a temp scrape."
                    )
                    self.temp_scrape = True
                if "landingPage" not in self.seed.keys():
                    raise MetadataError(
                        'We always need to provide a "landingPage" via the seed, either'
                        " on its own or alongside a dataURL for temporary scrapes."
                    )
                uri = self.seed["landingPage"]
        else:
            self.seed = None

        self.uri = uri
        self.dataset = pmdcat.Dataset(uri)
        self.catalog = dcat.Catalog()
        self.dataset.modified = datetime.now(timezone.utc).astimezone()
        self.distributions = []

        if session:
            self.session = session
        elif "RECORD_MODE" in os.environ:
            # don't use cachecontrol, but we'll need to patch the session when used.
            self.session = requests.Session()
        else:
            self.session = CacheControl(
                requests.Session(),
                cache=FileCache(".cache"),
                serializer=BiggerSerializer(),
                heuristic=LastModified(),
            )

        if "JOB_NAME" in os.environ:
            self._base_uri = URIRef("http://gss-data.org.uk")
            self._dataset_id = pathify(os.environ["JOB_NAME"])
        else:
            self._base_uri = BNode()
            parsed_scrape_uri = urlparse(self.uri)
            self._dataset_id = (parsed_scrape_uri.netloc.replace(".", "/") +
                                parsed_scrape_uri.path)
        self.update_dataset_uris()
        self._run()
Example #20
def start_http_session():
    # Start the cached HTTP Session.
    # Cache directory will be created if it doesn't exist.
    cache_path = utils.project_path('.cache')
    http_session = CacheControl(requests.Session(),
                                heuristic=CacheHeuristic(),
                                cache=FileCache(cache_path))
    http_session.headers = get_requests_header()
    return http_session
Example #21
 def __init__(self, api_user, api_key, aliases):
     s = requests.Session()
     s.headers["x-api-user"] = api_user
     s.headers["x-api-key"] = api_key
     self.s = CacheControl(s, cache=FileCache(str(HTTP_CACHE)))
     self.aliases = aliases
     self.cron_tz = CRON_TZ
     self.cron_time = CRON_TIME
     self.cron_file = CRON_FILE
Example #22
File: auth.py Project: alin23/spfy
 def get_session(*args, **kwargs):
     session = OAuth2Session(*args, **kwargs)
     cache_adapter = CacheControlAdapter(
         cache=FileCache(CACHE_FILE),
         pool_connections=config.http.connections,
         pool_maxsize=config.http.connections,
         max_retries=config.http.retries,
     )
     session.mount("http://", cache_adapter)
     return session
Example #23
    def __init__(
        self,
        name: str,
        url: str,
        config: Optional[Config] = None,
        disable_cache: bool = False,
        cert: Optional[Path] = None,
        client_cert: Optional[Path] = None,
    ) -> None:
        if name == "pypi":
            raise ValueError("The name [pypi] is reserved for repositories")

        self._packages = []
        self._name = name
        self._url = url.rstrip("/")
        self._client_cert = client_cert
        self._cert = cert
        self._cache_dir = REPOSITORY_CACHE_DIR / name
        self._cache = CacheManager({
            "default": "releases",
            "serializer": "json",
            "stores": {
                "releases": {
                    "driver": "file",
                    "path": str(self._cache_dir)
                },
                "packages": {
                    "driver": "dict"
                },
                "matches": {
                    "driver": "dict"
                },
            },
        })

        self._authenticator = Authenticator(
            config=config or Config(use_environment=True))

        self._session = CacheControl(self._authenticator.session,
                                     cache=FileCache(
                                         str(self._cache_dir / "_http")))

        username, password = self._authenticator.get_credentials_for_url(
            self._url)
        if username is not None and password is not None:
            self._authenticator.session.auth = requests.auth.HTTPBasicAuth(
                username, password)

        if self._cert:
            self._authenticator.session.verify = str(self._cert)

        if self._client_cert:
            self._authenticator.session.cert = str(self._client_cert)

        self._disable_cache = disable_cache
Example #24
    def session(self):
        session = self._authenticator.session

        if self._basic_auth:
            session.auth = self._basic_auth

        if self._cert:
            session.verify = str(self._cert)

        if self._client_cert:
            session.cert = str(self._client_cert)

        return CacheControl(session, cache=FileCache(str(self._cache_dir / "_http")))
Example #25
def _default_urlgetter(
    cache_dir=_DEFAULT_AWS_PRICING_CACHE_DIR,
    urlgetter=None,
):
    if urlgetter is None:
        from requests import Session
        from cachecontrol import CacheControl
        from cachecontrol.caches.file_cache import FileCache

        urlgetter = CacheControl(
            Session(),
            cache=FileCache(cache_dir),
        )
    return urlgetter
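A hypothetical call (the default cache directory is whatever _DEFAULT_AWS_PRICING_CACHE_DIR resolves to; the URL matches the pricing endpoint used in Example #4):

urlgetter = _default_urlgetter()
resp = urlgetter.get('https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/index.json')
index = resp.json()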
Example #26
    def __init__(
        self,
        url: str = "https://pypi.org/",
        disable_cache: bool = False,
        fallback: bool = True,
    ) -> None:
        super(PyPiRepository, self).__init__(url.rstrip("/") + "/simple/")

        self._base_url = url
        self._disable_cache = disable_cache
        self._fallback = fallback

        release_cache_dir = REPOSITORY_CACHE_DIR / "pypi"
        self._cache = CacheManager({
            "default": "releases",
            "serializer": "json",
            "stores": {
                "releases": {
                    "driver": "file",
                    "path": str(release_cache_dir)
                },
                "packages": {
                    "driver": "dict"
                },
            },
        })

        self._cache_control_cache = FileCache(str(release_cache_dir / "_http"))
        inner_session = requests.Session()
        retries = Retry(total=5,
                        backoff_factor=1,
                        status_forcelist=[502, 503, 504])
        inner_session.mount(self._base_url, HTTPAdapter(max_retries=retries))
        self._session = CacheControl(inner_session,
                                     cache=self._cache_control_cache)

        self._name = "PyPI"
Example #27
    def __init__(self, uri: str = None, session: requests.Session = None, seed: str = None):

        # Airtable and gssutils are using slightly different field names....
        self.meta_field_mapping = {
            "published": "issued"
        }

        # Add an explicit on/off for temp scraping (based on presence of dataURL)
        self.temp_scrape = False

        # Use seed if provided
        if seed is not None:
            with open(seed, "r") as f:
                self.seed = json.load(f)
                if "dataURL" in self.seed:
                    logging.warning("A temporary dataURL has been specified; proceeding with a temp scrape.")
                    uri = self.seed["dataURL"]
                    self.temp_scrape = True
                elif "landingPage" not in self.seed:
                    raise MetadataError("Aborting, insufficient seed data. No landing page supplied via "
                                        "info.json and no dataURL to use as a fallback.")
                else:
                    uri = self.seed["landingPage"]
        else:
            self.seed = None

        self.uri = uri
        self.dataset = pmdcat.Dataset(uri)
        self.catalog = dcat.Catalog()
        self.dataset.modified = datetime.now(timezone.utc).astimezone()
        self.distributions = []

        if session:
            self.session = session
        else:
            self.session = CacheControl(requests.Session(),
                                        cache=FileCache('.cache'),
                                        serializer=BiggerSerializer(),
                                        heuristic=LastModified())

        if 'JOB_NAME' in os.environ:
            self._base_uri = URIRef('http://gss-data.org.uk')
            self._dataset_id = pathify(os.environ['JOB_NAME'])
        else:
            self._base_uri = BNode()
            parsed_scrape_uri = urlparse(self.uri)
            self._dataset_id = parsed_scrape_uri.netloc.replace('.', '/') + parsed_scrape_uri.path
        self.update_dataset_uris()
        self._run()
Example #28
	def __init__(self, app_name: str, expires_after: datetime.timedelta = datetime.timedelta(days=28)):
		self.app_name: str = str(app_name)
		self.cache_dir = PathPlus(platformdirs.user_cache_dir(self.app_name))
		self.cache_dir.maybe_make(parents=True)

		self.session: requests.Session = CacheControl(
				sess=requests.Session(),
				cache=FileCache(self.cache_dir),
				heuristic=ExpiresAfter(
						days=expires_after.days,
						seconds=expires_after.seconds,
						microseconds=expires_after.microseconds,
						),
				adapter_class=RateLimitAdapter
				)
Example #29
def get_http_session():
    global _http_session

    if _http_session is None:
        _http_session = requests.session()

        if cachecontrol:
            _http_session = cachecontrol.CacheControl(
                _http_session,
                cache=FileCache(
                    user_cache_dir(__appname__, __appauthor__), forever=True
                ),
                heuristic=ExpiresAfter(days=14),
            )

    return _http_session
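The guard on the cachecontrol name makes caching strictly optional here: when the library fails to import, the function silently falls back to a plain requests session; when it is available, forever=True keeps cache files on disk while ExpiresAfter(days=14) bounds how long they are served as fresh.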
Example #30
 def __init__(self):
     super().__init__()
     self.name = "MAL"
     self.logo_url = 'https://upload.wikimedia.org/wikipedia/commons/7/7a/MyAnimeList_Logo.png'
     self.website_url = 'https://myanimelist.net/'
     client_id = "add1ed488bd218c2e10146345377a0b8"
     url_auth = "https://myanimelist.net/v1/oauth2/authorize"
     url_token = "https://myanimelist.net/v1/oauth2/token"
     self.authenticator = OAuth(self.name, client_id, url_auth, url_token)
     self.requests_session = CacheControl(requests.Session(),
                                          cache=FileCache('.Cache/MAL'),
                                          heuristic=MALHeuristic())
     # self.requests_session = requests.Session()
     self.rate_limiter = AsyncRateLimiter(max_calls=100,
                                          period=1,
                                          callback=limited)