Example #1
    def request(self, method, url, headers=None, params=None, proxies=None, cache=True, verify=False, *args, **kwargs):
        if headers is None: headers = {}
        if params is None: params = {}
        if proxies is None: proxies = {}

        url = self.normalize_url(url)

        headers.update({'Accept-Encoding': 'gzip, deflate'})
        headers.update(random.choice(USER_AGENTS))

        # request session ssl verify
        if sickrage.srCore.srConfig.SSL_VERIFY:
            try:
                verify = certifi.where()
            except Exception:
                pass

        # request session proxies
        if 'Referer' not in headers and sickrage.srCore.srConfig.PROXY_SETTING:
            sickrage.srCore.srLogger.debug("Using global proxy: " + sickrage.srCore.srConfig.PROXY_SETTING)
            scheme, address = urllib2.splittype(sickrage.srCore.srConfig.PROXY_SETTING)
            # prefix the proxy address with http:// if no scheme was given
            address = sickrage.srCore.srConfig.PROXY_SETTING if scheme else \
                'http://{}'.format(sickrage.srCore.srConfig.PROXY_SETTING)
            proxies.update({"http": address, "https": address})
            headers.update({'Referer': address})

        # setup session caching
        if cache:
            cache_file = os.path.abspath(os.path.join(sickrage.DATA_DIR, 'sessions.db'))
            self.__class__ = cachecontrol.CacheControl(self,
                                                       cache=DBCache(cache_file),
                                                       heuristic=ExpiresAfter(days=7)).__class__

        # get web response
        response = super(srSession, self).request(method,
                                                  url,
                                                  headers=headers,
                                                  params=params,
                                                  verify=verify,
                                                  proxies=proxies,
                                                  *args, **kwargs)

        try:
            # check web response for errors
            response.raise_for_status()
        except requests.exceptions.SSLError as e:
            if ssl.OPENSSL_VERSION_INFO < (1, 0, 1, 5):
                sickrage.srCore.srLogger.info(
                    "SSL Error requesting url: '{}' You have {}, try upgrading OpenSSL to 1.0.1e+".format(
                        e.request.url, ssl.OPENSSL_VERSION))

            if sickrage.srCore.srConfig.SSL_VERIFY:
                sickrage.srCore.srLogger.info(
                    "SSL Error requesting url: '{}', try disabling cert verification in advanced settings".format(
                        e.request.url))
        except Exception:
            pass

        return response
Example #2
 def __init__(self, config=None, cache=None):
     self.config = config or {}  # avoid sharing a mutable default argument
     if cache is None:
         # sticky local cache directory for testing
         cache = FileCache(".cache", forever=True)
     self.session = CacheControl(requests.Session(),
                                 cache=cache,
                                 heuristic=ExpiresAfter(days=30))
Example #3
    def request(self,
                method,
                url,
                headers=None,
                params=None,
                cache=True,
                raise_exceptions=True,
                *args,
                **kwargs):
        url = self.normalize_url(url)
        kwargs.setdefault('params', {}).update(params or {})
        kwargs.setdefault('headers', {}).update(headers or {})

        # if method == 'POST':
        #    self.session.headers.update({"Content-type": "application/x-www-form-urlencoded"})
        kwargs.setdefault('headers',
                          {}).update({'Accept-Encoding': 'gzip, deflate'})
        kwargs.setdefault('headers', {}).update(random.choice(USER_AGENTS))

        # request session ssl verify
        kwargs['verify'] = False
        if sickrage.srCore.srConfig.SSL_VERIFY:
            try:
                kwargs['verify'] = certifi.where()
            except Exception:
                pass

        # request session proxies
        if 'Referer' not in kwargs.get(
                'headers', {}) and sickrage.srCore.srConfig.PROXY_SETTING:
            sickrage.srCore.srLogger.debug(
                "Using global proxy: " +
                sickrage.srCore.srConfig.PROXY_SETTING)
            scheme, address = urllib2.splittype(
                sickrage.srCore.srConfig.PROXY_SETTING)
            # prefix the proxy address with http:// if no scheme was given
            address = sickrage.srCore.srConfig.PROXY_SETTING if scheme else \
                'http://{}'.format(sickrage.srCore.srConfig.PROXY_SETTING)
            kwargs.setdefault('proxies', {}).update({
                "http": address,
                "https": address
            })
            kwargs.setdefault('headers', {}).update({'Referer': address})

        # setup session caching
        if cache:
            cache_file = os.path.abspath(
                os.path.join(sickrage.DATA_DIR, 'sessions.db'))
            cachecontrol.CacheControl(self,
                                      cache=DBCache(cache_file),
                                      heuristic=ExpiresAfter(days=7))

        # get result
        response = super(srSession, self).request(method, url, *args,
                                                  **kwargs).result()
        if raise_exceptions:
            response.raise_for_status()

        return response
Example #4
import requests
from cachecontrol import CacheControlAdapter
from cachecontrol.heuristics import ExpiresAfter


def set_cache_expiration_delay(seconds):
    """
    Set up a cached requests session with a given expiration time.
    """
    session = requests.Session()
    adapter = CacheControlAdapter(
        heuristic=ExpiresAfter(seconds=seconds)
    )
    session.mount('http://', adapter)
    return session
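
A quick sketch of using the helper (example.com is a placeholder; only the 'http://' prefix is mounted above, so HTTPS traffic would bypass the cache):

session = set_cache_expiration_delay(300)
first = session.get('http://example.com/')   # goes to the network
second = session.get('http://example.com/')  # answered from the 5-minute cache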
Example #5
 def __init__(self):
     self.s = requests.Session()
     # We cache ALL responses for 60 min. so that e.g. back-to-back inline lyrics requests don't make two calls right after each other.
     # This MAY have unforeseen consequences, but hopefully we can deal with those.
     self.s = CacheControl(self.s,
                           cache_etags=False,
                           heuristic=ExpiresAfter(minutes=60))
     self.s.headers.update({
         'Accept': 'application/json',
         'User-Agent': VOCADB_USER_AGENT
     })
     self.opts = {'nameMatchMode': 'Auto', 'getTotalCount': 'true'}
     self._resources = {}
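
A rough usage sketch for the session built above; the class name VocaDbClient is hypothetical, and any two identical GETs inside the 60-minute window make the point:

api = VocaDbClient()
r1 = api.s.get('https://vocadb.net/api/songs', params=api.opts)  # network hit
r2 = api.s.get('https://vocadb.net/api/songs', params=api.opts)  # replayed from the cache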
Example #6
    def __init__(self,
                 place_id,
                 cache=True,
                 time_to_cache=SCHEDULE_CACHE_TIMING,
                 **kwargs):
        """ Initialize! """
        if cache:
            session = CacheControl(
                Session(), heuristic=ExpiresAfter(seconds=time_to_cache))
        else:
            session = None

        super(TrashClient, self).__init__(BASE_URL, session, **kwargs)
        self._place_id = place_id
Example #7
	def __init__(self, app_name: str, expires_after: datetime.timedelta = datetime.timedelta(days=28)):
		self.app_name: str = str(app_name)
		self.cache_dir = PathPlus(platformdirs.user_cache_dir(self.app_name))
		self.cache_dir.maybe_make(parents=True)

		self.session: requests.Session = CacheControl(
				sess=requests.Session(),
				cache=FileCache(self.cache_dir),
				heuristic=ExpiresAfter(
						days=expires_after.days,
						seconds=expires_after.seconds,
						microseconds=expires_after.microseconds,
						),
				adapter_class=RateLimitAdapter
				)
Example #8
def get_http_session():
    global _http_session

    if _http_session is None:
        _http_session = requests.session()

        if cachecontrol:
            _http_session = cachecontrol.CacheControl(
                _http_session,
                cache=FileCache(
                    user_cache_dir(__appname__, __appauthor__), forever=True
                ),
                heuristic=ExpiresAfter(days=14),
            )

    return _http_session
Example #9
 def __init__(self, url, max_retries, expires_after_sec):
     self.hasError = False
     self.fromCache = False
     self.url = url
     self.cacheEnabled = expires_after_sec > 0
     self.session = requests.Session()
     retryPolicy = urllib3.util.Retry(max_retries, status_forcelist=[400])
     if self.cacheEnabled:
         self.session.mount(
             url,
             CacheControlAdapter(
                 max_retries=retryPolicy,
                 heuristic=ExpiresAfter(seconds=expires_after_sec)))
     else:
         self.session.mount(
             url, requests.adapters.HTTPAdapter(max_retries=retryPolicy))
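
Because the adapter is mounted on the exact url prefix, only requests under that prefix are cached. A sketch, with Fetcher standing in for whatever class this __init__ belongs to:

f = Fetcher('https://api.example.com', max_retries=3, expires_after_sec=600)
r1 = f.session.get(f.url + '/items')            # network hit, retried on HTTP 400
r2 = f.session.get(f.url + '/items')            # within 10 minutes: served from cache
f.fromCache = getattr(r2, 'from_cache', False)  # cachecontrol tags replayed responses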
Example #10
File: http.py Project: betagouv/zam
def includeme(config: Configurator) -> None:
    """
    Called automatically via config.include("zam_repondeur.services.fetch.http")
    """
    session = requests.session()
    http_cache_dir = config.registry.settings["zam.http_cache_dir"]
    http_cache_duration = int(
        config.registry.settings["zam.http_cache_duration"])
    cached_session = CacheControl(
        session,
        cache=FileCache(http_cache_dir),
        heuristic=ExpiresAfter(minutes=http_cache_duration),
        controller_class=CustomCacheController,
    )
    config.registry.registerUtility(component=cached_session,
                                    provided=IHTTPSession)
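
Elsewhere in the application the cached session can then be looked up through the Pyramid registry; a minimal sketch, assuming a view that has access to the request:

def fetch_example(request):
    http_session = request.registry.getUtility(IHTTPSession)
    return http_session.get('https://example.org/resource')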
Example #11
    def fromParameters(
            cls, sessionFactory: Callable[[],
                                          requests.Session], cachePath: str,
            maxAgeDictionary: Mapping[str, int]) -> 'IntersphinxCache':
        """
        Construct an instance with the given parameters.

        @param sessionFactory: A zero-argument L{callable} that
            returns a L{requests.Session}.
        @param cachePath: Path of the cache directory.
        @param maxAgeDictionary: A mapping describing the maximum
            age of any cache entry.
        @see: L{parseMaxAge}
        """
        session = CacheControl(sessionFactory(),
                               cache=FileCache(cachePath),
                               heuristic=ExpiresAfter(**maxAgeDictionary))
        return cls(session)
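
A usage sketch under stated assumptions: the maxAgeDictionary keys are whatever ExpiresAfter (ultimately datetime.timedelta) accepts, and '/tmp/intersphinx' is a placeholder cache path:

cache = IntersphinxCache.fromParameters(
    requests.Session,
    '/tmp/intersphinx',
    {'days': 1, 'hours': 6},
)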
Example #12
    def fromParameters(cls, sessionFactory, cachePath, maxAgeDictionary):
        """
        Construct an instance with the given parameters.

        @param sessionFactory: A zero-argument L{callable} that
            returns a L{requests.Session}.

        @param cachePath: Path of the cache directory.
        @type cachePath: L{str}

        @param maxAgeDictionary: A dictionary describing the maximum
            age of any cache entry.
        @type maxAgeDictionary: L{dict}

        @see: L{parseMaxAge}
        """
        session = CacheControl(sessionFactory(),
                               cache=FileCache(cachePath),
                               heuristic=ExpiresAfter(**maxAgeDictionary))
        return cls(session)
Example #13
import feedparser
import requests
from cachecontrol import CacheControlAdapter
from cachecontrol.caches import FileCache
from cachecontrol.heuristics import ExpiresAfter
from django.conf import settings


def get_feed(feed_url):
    """
    Return the parsed feed.
    """
    requests_timeout = getattr(settings, 'FEED_TIMOUT', 1)

    cache_adapter = CacheControlAdapter(
        cache=FileCache('.web_cache'),
        heuristic=ExpiresAfter(hours=1),
    )

    session = requests.Session()
    session.mount('http://', cache_adapter)
    session.mount('https://', cache_adapter)

    show_exceptions = getattr(settings, 'DEBUG', True)

    feed_request = session.get(feed_url, timeout=requests_timeout)

    return feedparser.parse(feed_request.text)
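
Calling it is straightforward; a sketch with a placeholder feed URL:

feed = get_feed('https://example.org/rss.xml')
for entry in feed.entries[:5]:
    print(entry.title)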
Example #14
def get_session(target='http://', heuristic=None):
    """
    Gets a pre-configured :mod:`requests` session.

    This function configures the following behavior into the session:

    - Proxy settings are added to the session.
    - It is configured to use the instance's :data:`requests_cache`.
    - Permanent redirect caching is handled by :mod:`CacheControl`.
    - Temporary redirect caching is not supported.

    Each module / class instance which uses this should subsequently
    maintain its own session with whatever modifications it requires
    within a scope which makes sense for the use case (and probably close
    it when it's done).

    The session returned from here uses the instance's REQUESTS_CACHE with
    a single - though configurable - heuristic. If additional caches or
    heuristics need to be added, it's the caller's problem to set them up.

    .. note::
        The caching here seems to be pretty bad, particularly for digikey
        passive component search. I don't know why.

    :param target: Defaults to ``'http://'``. string containing a prefix
                   for the targets that should be cached. Use this to setup
                   site-specific heuristics.
    :param heuristic: The heuristic to use for the cache adapter.
    :type heuristic: :class:`cachecontrol.heuristics.BaseHeuristic`
    :rtype: :class:`requests.Session`

    """

    s = requests.session()
    if _proxy_dict is not None:
        s.proxies.update(_proxy_dict)
    if heuristic is None:
        heuristic = ExpiresAfter(seconds=MAX_AGE_DEFAULT)
    s.mount(target, _get_requests_cache_adapter(heuristic))
    return s
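
A sketch of the site-specific use the docstring describes; the prefix and the 600-second window are illustrative, not values from the original:

s = get_session(target='https://www.digikey.com/',
                heuristic=ExpiresAfter(seconds=600))
resp = s.get('https://www.digikey.com/products/en')
s.close()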
Example #15
def session_factory(cookie_string=None,
                    max_workers=10,
                    cache_dir=None,
                    cache_days=7,
                    cache_forever=False):
    session = requests.Session()

    if cookie_string:
        session.cookies = cookiejar_from_str(cookie_string)

    if cache_dir:
        logger.debug('Using CacheControl: dir=%r, days=%r, forever=%r',
                     cache_dir, cache_days, cache_forever)
        session = CacheControl(session,
                               cache=FileCache(cache_dir,
                                               forever=cache_forever),
                               heuristic=ExpiresAfter(days=cache_days))

    session = ProgressFuturesSession(max_workers=max_workers, session=session)

    logger.debug('%s with cookies: %s',
                 type(session).__name__, session.cookies)

    return session
Example #16
    def _build_session(self, max_retries):
        if not isinstance(max_retries, int):
            raise ValueError(f'int expected, found {type(max_retries)}.')
        elif max_retries < 1:
            raise ValueError('max_retries should be greater or equal to 1.')

        session = requests.Session()

        # mount a single adapter that layers caching on top of retries;
        # mounting a second adapter on the same 'http://' prefix would
        # silently replace the first and discard the retry policy
        session.mount(
            'http://',
            CacheControlAdapter(max_retries=Retry(total=max_retries,
                                                  method_whitelist=frozenset(
                                                      ['GET', 'POST'])),
                                heuristic=ExpiresAfter(hours=1)))

        session.headers['User-Agent'] = USER_AGENT

        self.session = session
Example #17

# +
from gssutils import *
from requests import Session
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.heuristics import ExpiresAfter

scraper = Scraper(
    'https://statswales.gov.wales/Catalogue/Housing/Social-Housing-Stock-and-Rents/averageweeklyrentsinstockatsocialrent-by-area-accommodation-providertype',
    session=CacheControl(Session(),
                         cache=FileCache('.cache'),
                         heuristic=ExpiresAfter(days=7)))
scraper
# -

if len(scraper.distributions) == 0:
    from gssutils.metadata import Distribution
    dist = Distribution(scraper)
    dist.title = 'Dataset'
    dist.downloadURL = 'http://open.statswales.gov.wales/dataset/hous0601'
    dist.mediaType = 'application/json'
    scraper.distributions.append(dist)
table = scraper.distribution(title='Dataset').as_pandas()
table

table.columns
Example #18
    def __init__(self, **kwargs):
        '''Base class for common scraping tasks.'''
        logging.getLogger(__name__).addHandler(logging.NullHandler())
        self.urls = []

        # use requests HTML to aid parsing
        # has all same methods as requests.Session
        _s = HTMLSession()

        # delay/expire
        if kwargs.get('delay'):
            self.delay = kwargs['delay']
        else:
            self.delay = 2

        if kwargs.get('expire_hours'):
            self.expire_hours = kwargs['expire_hours']
        else:
            self.expire_hours = 168

        # add cookies
        if kwargs.get('cookies'):
            _s.cookies = kwargs['cookies']
        else:
            try:
                import cookielib
                _s.cookies = cookielib.MozillaCookieJar()
            except (NameError, ImportError):
                import http.cookiejar
                _s.cookies = http.cookiejar.MozillaCookieJar()
                
        # add headers
        if kwargs.get('headers'):
            _s.headers = kwargs['headers']
        else:
            ua = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
            _s.headers = {'User-Agent': ua}

        # add proxies
        if kwargs.get('proxies'):
            _s.proxies = kwargs['proxies']

        # add cache
        if not kwargs.get('cache_name'):
            self.cache_name = os.path.join('/tmp', 'scraper-cache')  # fallback name; the original raised KeyError here
        elif '/' not in kwargs['cache_name']:
            self.cache_name = os.path.join('/tmp', kwargs['cache_name'])
        else:
            self.cache_name = kwargs['cache_name']
        try:
            from cachecontrol import CacheControlAdapter
            from cachecontrol.heuristics import ExpiresAfter
            from cachecontrol.caches import FileCache
            _s.mount('http://', CacheControlAdapter(cache=FileCache(self.cache_name),
                                                    cache_etags=False,
                                                    heuristic=ExpiresAfter(hours=self.expire_hours)))
        except ImportError:
            try:
                import requests_cache
                requests_cache.install_cache(self.cache_name)
            except Exception:
                logging.exception('could not install cache')
        self.s = _s
Example #19
import os
import shutil
import stat
import zipfile
from typing import Any, Dict, Optional

import aiohttp
import requests
from cachecontrol import CacheControl, CacheControlAdapter
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.heuristics import ExpiresAfter

from shared import configuration, perf
from shared.pd_exception import OperationalException

SESSION = CacheControl(requests.Session(),
                       cache=FileCache(configuration.get('web_cache')))
SESSION.mount('http://whatsinstandard.com',
              CacheControlAdapter(heuristic=ExpiresAfter(days=14)))

AIOSESSION = aiohttp.ClientSession()


def unzip(url: str, path: str) -> str:
    location = '{scratch_dir}/zip'.format(
        scratch_dir=configuration.get('scratch_dir'))

    def remove_readonly(func, path, _):
        os.chmod(path, stat.S_IWRITE)
        func(path)

    shutil.rmtree(location, True, remove_readonly)
    os.mkdir(location)
    store(url, '{location}/zip.zip'.format(location=location))
Example #20
    def request(self,
                method,
                url,
                headers=None,
                params=None,
                cache=True,
                *args,
                **kwargs):
        url = self.normalize_url(url)
        kwargs.setdefault('params', {}).update(params or {})
        kwargs.setdefault('headers', {}).update(headers or {})

        # if method == 'POST':
        #    self.session.headers.update({"Content-type": "application/x-www-form-urlencoded"})
        kwargs.setdefault('headers',
                          {}).update({'Accept-Encoding': 'gzip, deflate'})
        kwargs.setdefault('headers', {}).update(random.choice(USER_AGENTS))

        # request session ssl verify
        kwargs['verify'] = False
        if sickrage.srCore.srConfig.SSL_VERIFY:
            try:
                kwargs['verify'] = certifi.where()
            except Exception:
                pass

        # request session proxies
        if 'Referer' not in kwargs.get(
                'headers', {}) and sickrage.srCore.srConfig.PROXY_SETTING:
            sickrage.srCore.srLogger.debug(
                "Using global proxy: " +
                sickrage.srCore.srConfig.PROXY_SETTING)
            scheme, address = urllib2.splittype(
                sickrage.srCore.srConfig.PROXY_SETTING)
            # prefix the proxy address with http:// if no scheme was given
            address = sickrage.srCore.srConfig.PROXY_SETTING if scheme else \
                'http://{}'.format(sickrage.srCore.srConfig.PROXY_SETTING)
            kwargs.setdefault('proxies', {}).update({
                "http": address,
                "https": address
            })
            kwargs.setdefault('headers', {}).update({'Referer': address})

        # setup session caching
        if cache:
            cache_file = os.path.abspath(
                os.path.join(sickrage.DATA_DIR, 'sessions.db'))
            cachecontrol.CacheControl(self,
                                      cache=DBCache(cache_file),
                                      heuristic=ExpiresAfter(days=7))

        # get web response
        response = super(srSession, self).request(method, url, *args,
                                                  **kwargs).result()

        try:
            # check web response for errors
            response.raise_for_status()
        except requests.exceptions.SSLError as e:
            if ssl.OPENSSL_VERSION_INFO < (1, 0, 1, 5):
                sickrage.srCore.srLogger.info(
                    "SSL Error requesting url: '{}' You have {}, try upgrading OpenSSL to 1.0.1e+"
                    .format(e.request.url, ssl.OPENSSL_VERSION))

            if sickrage.srCore.srConfig.SSL_VERIFY:
                sickrage.srCore.srLogger.info(
                    "SSL Error requesting url: '{}' Try disabling Cert Verification on the advanced tab of /config/general"
                    .format(e.request.url))
        except Exception as e:
            sickrage.srCore.srLogger.debug(str(e))

        return response
Example #21
import requests
from django.conf import settings
from django.http import HttpResponse, JsonResponse

from cachecontrol import CacheControl
from cachecontrol.heuristics import ExpiresAfter
from cachecontrol_django import DjangoCache
from django.views.decorators.gzip import gzip_page

cached_sess = CacheControl(
    requests.session(),
    cache=DjangoCache(),
    heuristic=ExpiresAfter(seconds=30)
)


@gzip_page
def variomes_single_ref(request):
    # proxy requests to variomes server
    response = cached_sess.get('%s/fetchLit.jsp' % settings.VARIOMES_BASE_URL,
                               params=request.GET, verify=settings.VARIOMES_VERIFY_REQUESTS)

    if response.status_code != 200:
        return HttpResponse(response.content, status=response.status_code)

    return JsonResponse(response.json())


@gzip_page
def variomes_search(request):
Example #22
def main(group_id, location, time_boundary, event_status, pandoc):
    key_path = os.path.normpath(os.path.expanduser('~/.meetup.com-key'))
    if os.path.exists(key_path):
        with open(key_path) as fh:
            key = fh.read().strip()
    else:
        key = None

    cache = FileCache('.web_cache', forever=True)
    requests = CacheControl(
        Session(), cache,
        cache_etags=False,
        heuristic=ExpiresAfter(days=1)
    )

    while True:
        resp = requests.get('https://api.meetup.com/status', params=dict(key=key))
        if resp.status_code == 200:
            break
        elif resp.status_code == 401:
            click.echo('Your meetup.com key is required. You can get it from https://secure.meetup.com/meetup_api/key/\n')

            if click.confirm('Open https://secure.meetup.com/meetup_api/key/ in your web browser?'):
                click.launch('https://secure.meetup.com/meetup_api/key/')

            click.echo('')
            key = click.prompt('Key', hide_input=True)
        else:
            raise click.ClickException('Failed to get meetup.com status. Response was {!r}'.format(resp.text))

    click.secho('For convenience your key is saved in `{}`.\n'.format(key_path), fg='magenta')
    with open(key_path, 'w') as fh:
        fh.write(key)

    while not location:
        location = location or get_input('Location: ', completer=WordCompleter(['cluj', 'iasi', 'timisoara'], ignore_case=True))

    while True:
        group_id = group_id or get_input('Group ID: ', completer=WordCompleter(['Cluj-py', 'RoPython-Timisoara'], ignore_case=True))

        resp = requests.get('https://api.meetup.com/2/events', params=dict(
            key=key,
            group_urlname=group_id,
            time=time_boundary,
            status=event_status,
        ))
        if resp.status_code == 200:
            json = resp.json()
            if json['results']:
                break
            else:
                click.secho('Invalid group `{}`. It has no events!'.format(group_id), fg='red')
                group_id = None
        if resp.status_code == 400:
            raise click.ClickException('Failed to make a correct request. Response was {!r}'.format(resp.text))
        else:
            click.secho('Invalid group `{}`. Response was [{}] {!r}'.format(group_id, resp.status_code, resp.text), fg='red')

    # click.echo(pformat(dict(resp.headers)))

    for event in json['results']:
        dt = datetime.fromtimestamp(event['time']/1000)
        click.echo("{}: {}".format(
            dt.strftime('%Y-%m-%d %H:%M:%S'),
            event['name']
        ))
        existing_path = glob(os.path.join('content', '*', dt.strftime('%Y-%m-%d*'), 'index.rst'))
        if existing_path:
            if len(existing_path) > 1:
                click.secho('\tERROR: multiple paths matched: {}'.format(existing_path))
            else:
                click.secho('\t`{}` already exists. Not importing.'.format(*existing_path), fg='yellow')
        else:
            target_dir = os.path.join('content', location, '{}-{}'.format(dt.strftime('%Y-%m-%d'), slugify(event['name'])))
            target_path = os.path.join(target_dir, 'index.rst')
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)

            if pandoc:
                with tempfile.NamedTemporaryFile(delete=False) as fh:
                    fh.write(event['description'].encode('utf-8'))
                rst = subprocess.check_output(['pandoc', '--from=html', '--to=rst', fh.name]).decode('utf-8')
                print(fh.name)
                #os.unlink(fh.name)
            else:
                stream = StringIO()
                html2rest(event['description'].encode('utf-8'), writer=stream)
                rst = stream.getvalue().decode('utf-8')

            with io.open(target_path, 'w', encoding='utf-8') as fh:
                fh.write('''{name}
###############################################################

:tags: unknown
:registration:
    meetup.com: {event_url}

{rst}'''.format(rst=rst, **event))
            click.secho('\tWrote `{}`.'.format(target_path), fg='green')
Example #23
from operator import itemgetter

import requests
from django.conf import settings
from requests_futures.sessions import FuturesSession
from django.http import HttpResponse, JsonResponse

from cachecontrol import CacheControl
from cachecontrol.heuristics import ExpiresAfter
from cachecontrol_django import DjangoCache

from api.models import Gene

cached_sess = CacheControl(requests.session(),
                           cache=DjangoCache(),
                           heuristic=ExpiresAfter(days=10))

# -------------------------------------------------------------
# --- SOCIBP acquisition endpoints
# -------------------------------------------------------------


def get_genes(request):
    # params: projection=SUMMARY&pageSize=100000&pageNumber=0&direction=ASC
    response = requests.get(settings.SOCIBP_API_URL + '/genes',
                            params={
                                'projection': 'SUMMARY',
                                'pageSize': 100000,
                                'pageNumber': '0',
                                'direction': 'ASC'
                            },
Example #24
import requests
import selenium.webdriver.support.ui as selenium_ui
from cachecontrol import CacheControlAdapter
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.heuristics import ExpiresAfter
from distutils.dir_util import copy_tree



# CHEF and CONTENT DEBUG
################################################################################
DEBUG_MODE = False                    # print extra-verbose info
DOWNLOAD_ONE_TO_webroot = False       # produce debug webroot/ and skip cheffing
DOWNLOAD_ONLY_N = False               # chef only first N books; set to False to disable



sess = requests.Session()
cache = FileCache('.webcache')
chefdev_adapter = CacheControlAdapter(heuristic=ExpiresAfter(days=1), cache=cache)
sess.mount('http://3asafeer.com/', chefdev_adapter)
sess.mount('http://fonts.googleapis.com/', chefdev_adapter)


headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive"
}

# PAUSES and DELAYS
################################################################################
LOADING_WAIT_TIME = 10                                             # long delay
LOADING_WAIT_TIME_MS = LOADING_WAIT_TIME*1000
LOADING_WAIT_TIME_SHORT = 7                                        # short delay
Example #25
    def __init__(self, **kwargs):
        """
        """
        logging.getLogger(__name__).addHandler(logging.NullHandler())
        self.urls = []

        # use requests HTML to aid parsing
        # has all same methods as requests.Session
        _s = HTMLSession()
        self.delay = kwargs.get("delay", 2)
        self.expire_hours = kwargs.get("expire_hours", 168)

        # add cookies
        if kwargs.get("cookies"):
            _s.cookies = kwargs["cookies"]
        else:
            import http.cookiejar

            _s.cookies = http.cookiejar.MozillaCookieJar()

        # add headers
        default_headers = {
            "User-Agent": random.choice(USER_AGENTS),
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "en-US,en;q=0.9",
            "accept": "application/json, text/plain, */*",
        }
        _s.headers.update(default_headers)
        if kwargs.get("headers"):
            _s.headers.update(kwargs["headers"])

        # add proxies
        if kwargs.get("proxies"):
            _s.proxies = kwargs["proxies"]

        # add cache
        if not kwargs.get("cache_name"):
            self.cache_name = os.path.join("/tmp", random_string(32))
        elif "/" not in kwargs.get("cache_name", ""):
            self.cache_name = os.path.join("/tmp", kwargs["cache_name"])
        else:
            self.cache_name = kwargs.get("cache_name")

        try:
            from cachecontrol import CacheControlAdapter
            from cachecontrol.heuristics import ExpiresAfter
            from cachecontrol.caches import FileCache

            _s.mount(
                "http://",
                CacheControlAdapter(
                    cache=FileCache(self.cache_name),
                    cache_etags=False,
                    heuristic=ExpiresAfter(hours=self.expire_hours),
                ),
            )
        except ImportError:
            try:
                import requests_cache

                requests_cache.install_cache(self.cache_name)
            except BaseException:
                logging.exception("could not install cache")
        self.session = _s
Example #26
# # WG Notifications of deaths of residents related to COVID-19 in adult care homes 

from gssutils import * 
import json 
import numpy as np
import glob
from requests import Session
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.heuristics import ExpiresAfter

scrape = Scraper(seed="info.json",
                 session=CacheControl(Session(), cache=FileCache('.cache'), heuristic=ExpiresAfter(days=1))
)
scraper = scrape
scraper

dist = scrape.distribution(
    latest=True,
    title=lambda x: x.startswith('Notifications of deaths of residents related to COVID-19')
)
dist

tabs = { tab.name: tab for tab in dist.as_databaker() }
list(tabs)

# +
#Check tab contents is what is expected before continuing. 

expected_tabs = ['Contents','Information','Table_1','Table_2','Table_3','Table_4','Table_5','Table_6','Table_7','Table_8', 'Table_9']
whats_missing = [item for item in tabs if item not in expected_tabs]
Example #27
import re
import requests

import discord
from discord.ext import commands
from discord.member import Member
from jikanpy import Jikan
from cachecontrol import CacheControl
from cachecontrol.heuristics import ExpiresAfter
from cachecontrol.caches.file_cache import FileCache

import config

expires = ExpiresAfter(days=1)
session = CacheControl(requests.Session(),
                       heuristic=expires,
                       cache=FileCache(config.cache_dir))
jikan = Jikan(session=session)


class JoinableMessage:
    def __init__(self, message: discord.message, bot):
        self.message = message
        self.bot = bot

    def is_joinable(self):
        if self.message.author.id != self.bot.user.id:
            return False
        if len(self.message.embeds) == 0:
            return False
        if self.get_field('channel') is None:
Example #28
 def setup(self):
     self.sess = Session()
     self.cache_sess = CacheControl(self.sess,
                                    heuristic=ExpiresAfter(days=1))
Example #29
import os
from pathlib import Path

import appdirs
import requests
from cachecontrol import CacheControlAdapter
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.heuristics import ExpiresAfter

TIMEOUT_SECONDS = 10  # Seconds before URL query timeout is raised

PROVIDERS_URLS = [
    "https://providers.optimade.org/v1/links",
    "https://raw.githubusercontent.com/Materials-Consortia/providers/master/src"
    "/links/v1/providers.json",
]

CACHE_DIR = Path(appdirs.user_cache_dir("optimade-client", "CasperWA"))
CACHE_DIR.mkdir(parents=True, exist_ok=True)
CACHED_PROVIDERS = CACHE_DIR / "cached_providers.json"

SESSION = requests.Session()
SESSION_ADAPTER = CacheControlAdapter(
    cache=FileCache(CACHE_DIR / ".requests_cache"), heuristic=ExpiresAfter(days=1)
)
SESSION_ADAPTER_DEBUG = CacheControlAdapter()
SESSION.mount("http://", SESSION_ADAPTER)
SESSION.mount("https://", SESSION_ADAPTER)
SESSION.mount("http://localhost", SESSION_ADAPTER_DEBUG)
SESSION.mount("http://127.0.0.1", SESSION_ADAPTER_DEBUG)

# Currently known providers' development OPTIMADE base URLs
DEVELOPMENT_PROVIDERS = {"mcloud": "https://dev-www.materialscloud.org/optimade"}

try:
    DEVELOPMENT_MODE = bool(int(os.getenv("OPTIMADE_CLIENT_DEVELOPMENT_MODE", "0")))
except ValueError:
    LOGGER.debug(
        (
Example #30
def main(group_id, location, time_boundary, event_status, pandoc, force):
    key_path = os.path.normpath(os.path.expanduser('~/.meetup.com-key'))
    if os.path.exists(key_path):
        with io.open(key_path, encoding='utf8') as fh:
            key = fh.read().strip()
    else:
        key = None
    cache = FileCache('.web_cache', forever=True)
    requests = CacheControl(Session(),
                            cache,
                            cache_etags=False,
                            heuristic=ExpiresAfter(days=1))

    while True:
        resp = requests.get('https://api.meetup.com/status',
                            params=dict(key=key))
        if resp.status_code == 200 and resp.json().get('status') == 'ok':
            break
        elif resp.status_code == 200 and any(
                'auth_fail' == e.code for e in resp.json().get('errors', [])):
            click.echo(
                'Your meetup.com key is required. You can get it from https://secure.meetup.com/meetup_api/key/\n'
            )

            if click.confirm(
                    'Open https://secure.meetup.com/meetup_api/key/ in your web browser?'
            ):
                click.launch('https://secure.meetup.com/meetup_api/key/')

            click.echo('')
            key = click.prompt('Key', hide_input=True)
        else:
            raise click.ClickException(
                'Failed to get meetup.com status. Response was {!r} {!r}'.
                format(resp.status_code, resp.text))

    click.secho(
        'For convenience your key is saved in `{}`.\n'.format(key_path),
        fg='magenta')
    with open(key_path, 'w') as fh:
        fh.write(key)

    while not location:
        location = location or get_input(
            u'Location: ',
            completer=WordCompleter(
                [u'cluj', u'iasi', u'timisoara', u'bucuresti'],
                ignore_case=True))

    while True:
        group_id = group_id or get_input(
            u'Group ID: ',
            completer=WordCompleter([
                u'RoPython-Bucuresti', u'RoPython-Cluj', u'RoPython_Iasi',
                u'RoPython-Timisoara'
            ],
                                    ignore_case=True))

        resp = requests.get('https://api.meetup.com/2/events',
                            params=dict(
                                key=key,
                                group_urlname=group_id,
                                time=time_boundary,
                                status=event_status,
                            ))
        if resp.status_code == 200:
            json = resp.json()
            if json['results']:
                break
            else:
                click.secho(
                    'Invalid group `{}`. It has no events!'.format(group_id),
                    fg='red')
                group_id = None
        if resp.status_code == 400:
            raise click.ClickException(
                'Failed to make a correct request. Response was {!r}'.format(
                    resp.text))
        else:
            click.secho('Invalid group `{}`. Response was [{}] {!r}'.format(
                group_id, resp.status_code, resp.text),
                        fg='red')

    # click.echo(pformat(dict(resp.headers)))

    for event in json['results']:
        dt = datetime.fromtimestamp(event['time'] / 1000)
        event['duration'] = format_duration(
            event.get('duration', 3600000) / 1000)
        event['time'] = dt.strftime('%Y-%m-%d %H:%M')
        if 'how_to_find_us' in event:
            address = event['how_to_find_us'],
        else:
            address = ()
        if 'venue' in event:
            address_1 = event['venue'].get('address_1')
            if address_1:
                address += address_1,
            event['venue']['address_1'] = ', '.join(address)
        else:
            event['venue'] = {'address_1': address}
        click.echo("{time}: {name}".format(**event))
        click.echo("\t{}".format(pformat(event)))
        existing_path = glob(
            os.path.join('content', '*', dt.strftime('%Y-%m-%d*'),
                         'index.rst'))
        if existing_path and not force:
            if len(existing_path) > 1:
                click.secho('\tERROR: multiple paths matched: {}'.format(
                    existing_path))
            else:
                click.secho('\t`{}` already exists. Not importing.'.format(
                    *existing_path),
                            fg='yellow')
        else:
            target_dir = os.path.join(
                'content', location, '{}-{}'.format(dt.strftime('%Y-%m-%d'),
                                                    slugify(event['name'])))
            target_path = os.path.join(target_dir, 'index.rst')
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)

            if pandoc:
                with tempfile.NamedTemporaryFile(delete=False) as fh:
                    fh.write(event['description'].encode('utf-8'))
                rst = subprocess.check_output(
                    ['pandoc', '--from=html', '--to=rst',
                     fh.name]).decode('utf-8')
                os.unlink(fh.name)
            else:
                rst = html2rest(event['description'])

            doc = u'''{name}
###############################################################

:tags: prezentari
:registration:
    meetup.com: {event_url}
:start: {time}
:duration: {duration}
:location: {venue[address_1]}, {venue[city]}, {venue[localized_country_name]}

{rst}'''.format(rst=rst, **event)
            with io.open(target_path, 'w', encoding='utf-8') as fh:
                fh.write(doc)
            click.secho('\tWrote `{}`.'.format(target_path), fg='green')