def clear_cache(self):
     """
     Clear the cache database.
     """
     msg = 'Cached Data has been cleared'
     requests_cache.get_cache().clear()
     xbmcgui.Dialog().notification(_plugin, msg, _icon, 3000, False)
def cache_initiated():
    """Hackish function to test if there is an existing requests_cache"""
    try:
        requests_cache.get_cache()
        return True
    except AttributeError:
        return False
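A minimal usage sketch for this probe (assuming the legacy monkey-patching requests_cache API, where get_cache() raises AttributeError until install_cache() has been called; the ensure_cache name is hypothetical):

import requests_cache

def ensure_cache(path='cache'):
    # Install a cache only if cache_initiated() reports that none is active yet.
    if not cache_initiated():
        requests_cache.install_cache(path)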
Example #3
def uninstall_cache():
    """Deactivates caches"""
    try:
        requests_cache.get_cache()
    except AttributeError:
        pass
    else:
        requests_cache.uninstall_cache()
Example #4
    def get_one_page(self, url, usecache=True):
        if not usecache:
            if requests_cache.get_cache().has_url(url):
                requests_cache.get_cache().delete_url(url)

        print "# fetching: %s" % url
        i = requests.get(url, auth=(self.username, self.password))
        #print "# fetched: %s" % url
        return i
Example #5
    def get_comments(self, usecache=True):
        for k in self.datadict.keys():
            if 'comments_url' in self.datadict[k]:

                if not usecache:
                    if requests_cache.get_cache().has_url(self.datadict[k]['comments_url']):
                        requests_cache.get_cache().delete_url(self.datadict[k]['comments_url'])

                i = self.get_one_page(self.datadict[k]['comments_url'])
                idict = json.loads(i.content)
                self.datadict[k]['comments'] = idict
Example #6
def has_url(url):
    try:
        cache = requests_cache.get_cache()
    except AttributeError:
        return False

    return cache.has_url(url)
Example #7
def load_annotations(data_folder):
    contents = os.listdir(path=data_folder)
    try:
        infile = [i for i in contents if '.tsv' in i][0]
    except IndexError:
        raise Exception(f"No .tsv found in {contents}")
    
    with open(infile, 'r') as litcovid_tsv:
        tsv_reader = csv.reader(litcovid_tsv, delimiter='\t')
        for i in range(32):
            next(tsv_reader)
        pmids = [line[0] for line in tsv_reader]

    doc_id_set = set()
    requests_cache.install_cache('litcovid_cache')
    requests_cache.clear()
    s = requests_cache.CachedSession()
    s.hooks = {'response': throttle}
    logging.debug("requests_cache: %s", requests_cache.get_cache().responses.filename)
    for i, pmid in enumerate(pmids,start=1):
        # NCBI eutils API limits requests to 10/sec
        if i % 100 == 0:
            logging.info("litcovid.parser.load_annotations progress %s", i)

        doc = getPubMedDataFor(pmid, session=s)
        if doc['_id'] not in doc_id_set:
            yield doc
        doc_id_set.add(doc['_id'])

    remove_expired(s)
def get_timestamp(url):
    """
    Get the timestamp of an HTTP GET request.
    :param url: the URL of the request
    :return: the timestamp of the request, or None if the request is not in the cache
    """
    def _to_bytes(s, encoding='utf-8'):
        return bytes(s, encoding)

    def create_key(request):
        url, body = request.url, request.body
        key = hashlib.sha256()
        key.update(_to_bytes(request.method.upper()))
        key.update(_to_bytes(url))
        if request.body:
            key.update(_to_bytes(body))
        return key.hexdigest()

    def url_to_key(url):
        session = requests.Session()
        return create_key(session.prepare_request(requests.Request('GET', url)))

    #   get the cache from request_cache
    results = requests_cache.get_cache()
    #   create the key according to the url
    key_url = url_to_key(url)
    #   results.responses is a dictionary and follows the following format:
    #   { 'key': (requests_cache.backends objects, timestamp), ..., }
    #   for example: '4c28e3e4a61e325e520d9c02e0caee99e30c00951a223e67':
    #                       (<requests_cache.backends.base._Store object at 0x12697e630>,
    #                           datetime.datetime(2018, 10, 16, 0, 19, 8, 130204)),
    if key_url in results.responses:
        back_obj, timestamp = results.responses[key_url]
        return timestamp
    return None
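A short usage sketch (the URL is illustrative; it assumes a cache has already been installed and that the installed requests_cache version derives keys the same way the nested create_key helper above does):

import requests
import requests_cache

requests_cache.install_cache('demo_cache')
requests.get('http://httpbin.org/get')            # populates the cache
print(get_timestamp('http://httpbin.org/get'))    # datetime of the cached response, or None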
Example #9
def __init__(self,
             cache_name="cache",
             expire_after=300,
             http_get_timeout=5,
             delay_between_requests=8,
             headers=HEADERS):
    self.http_get_timeout = http_get_timeout
    self.delay_between_requests = delay_between_requests
    self.headers = headers
    self.expire_after = expire_after
    self.cache_name = cache_name
    # Install cache
    requests_cache.install_cache(self.cache_name,
                                 backend='sqlite',
                                 expire_after=self.expire_after)
    self._cache = requests_cache.get_cache()
    # Establish a session to be used for the GET requests
    # IMPORTANT: the session must be established after installing the cache
    # if you want all your responses to be cached, i.e. monkey-patching
    # requests. Ref.: https://bit.ly/2MCDCeD
    self._req_session = requests.Session()
    # Add headers to the request session
    self._req_session.headers = self.headers
    self._last_request_time = -sys.float_info.max
    self.response = None
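A stripped-down sketch of the ordering that comment describes (assuming the monkey-patching style of requests_cache, where sessions created after install_cache() are cache-aware; the cache name and URL are illustrative only):

import requests
import requests_cache

requests_cache.install_cache('cache', backend='sqlite', expire_after=300)
session = requests.Session()  # created *after* install_cache, so its responses are cached
response = session.get('http://httpbin.org/get')
print(getattr(response, 'from_cache', False))  # False on the first call, True once cached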
Example #10
def load_annotations(data_folder):

    infile = os.path.join(data_folder, "litcovid2BioCJSON.gz")
    assert os.path.exists(infile)

    with open_anyfile(infile, mode='r') as file:
        a = file.read()
        data_list = json.loads(a)
        # First item is a comment by provider
        data = data_list[1]

    doc_id_set = set()
    with requests_cache.enabled('litcovid_cache', expire_after=expire_after):
        logging.debug("requests_cache: %s",
                      requests_cache.get_cache().responses.filename)
        for i, rec in enumerate(data, start=1):
            # NCBI eutils API limits requests to 10/sec
            if i % 100 == 0:
                logging.info("litcovid.parser.load_annotations progress %s", i)

            doc = getPubMedDataFor(rec["pmid"])
            if not doc['from_cache']:
                time.sleep(.2)
            doc.pop('from_cache')
            if doc['_id'] not in doc_id_set:
                yield doc
            doc_id_set.add(doc['_id'])
        requests_cache.core.remove_expired_responses()
Example #11
def weather():
    lat = request.form['lat']
    lon = request.form['lon']
    t_threshold = request.form['t_threshold']
    w_threshold = request.form['w_threshold']
    metric = request.form['metric']
    r = requests.get(
        'https://api.darksky.net/forecast/25526a9ace577c46efe91103749d39ed/' +
        lat + ',' + lon + '?units=' + metric +
        '&exclude=minutely,flags,hourly')

    old_stdout = sys.stdout
    log_file = open("cache.log", "w")
    sys.stdout = log_file

    print('Cache enabled for this request:', r.from_cache)
    print('Request:', requests_cache.get_cache())

    sys.stdout = old_stdout
    log_file.close()

    json = r.json()

    epoch = json['currently']['time']
    currenttime = time.strftime("%a, %d %b %Y %H:%M:%S %Z",
                                time.localtime(epoch))

    return render_template('newweather.html',
                           json=json,
                           currenttime=currenttime,
                           t_threshold=t_threshold,
                           w_threshold=w_threshold)
Example #12
def install(self):
    """install cache"""
    requests  # require `import requests` in case this is essential for monkey patching by requests_cache.
    requests_cache.install_cache(self.path, include_get_headers=True)
    self.cache = requests_cache.get_cache()
    logging.info(
        f"requests-cache starting with {len(self.cache.responses)} cached responses"
    )
Example #13
def _wait_for_limiting(self):
    # https://api.github.com/users/whatever
    url = 'https://api.github.com/users/' + self.username
    sleeptime = True
    while sleeptime:
        if requests_cache.get_cache().has_url(url):
            requests_cache.get_cache().delete_url(url)
        i = requests.get(url, auth=(self.username, self.password))
        sleeptime = i.headers.get('X-RateLimit-Reset', None)
        if sleeptime:
            sleeptime = calendar.timegm(time.gmtime()) - int(sleeptime)
            if sleeptime < 0:
                sleeptime = sleeptime * -1
            n_time = time.time()
            n_time = datetime.fromtimestamp(n_time).strftime('%Y-%m-%d %H:%M:%S')
            print("# %s sleeping %s" % (n_time, sleeptime))
            time.sleep(sleeptime)
Example #14
    def setup(self):
        ds_args = dict(id=self.name)
        defaults = dict(name='HelMet-kirjastot')
        self.data_source, _ = DataSource.objects.get_or_create(
            defaults=defaults, **ds_args)
        self.tprek_data_source = DataSource.objects.get(id='tprek')
        self.ahjo_data_source = DataSource.objects.get(id='ahjo')
        system_data_source_defaults = {'user_editable': True}
        self.system_data_source, _ = DataSource.objects.get_or_create(id=settings.SYSTEM_DATA_SOURCE_ID,
                                                                      defaults=system_data_source_defaults)

        org_args = dict(origin_id='u4804001010', data_source=self.ahjo_data_source)
        defaults = dict(name='Helsingin kaupunginkirjasto')
        self.organization, _ = Organization.objects.get_or_create(defaults=defaults, **org_args)
        org_args = dict(origin_id='00001', data_source=self.ahjo_data_source)
        defaults = dict(name='Helsingin kaupunki')
        self.city, _ = Organization.objects.get_or_create(defaults=defaults, **org_args)

        # Build a cached list of Places
        loc_id_list = [l[1] for l in LOCATIONS.values()]
        place_list = Place.objects.filter(
            data_source=self.tprek_data_source
        ).filter(origin_id__in=loc_id_list)
        self.tprek_by_id = {p.origin_id: p.id for p in place_list}

        # Create "Tapahtuma vain internetissä" location if not present
        defaults = dict(data_source=self.system_data_source,
                        publisher=self.city,
                        name='Internet',
                        description='Tapahtuma vain internetissä.',)
        self.internet_location, _ = Place.objects.get_or_create(id=INTERNET_LOCATION_ID, defaults=defaults)

        try:
            yso_data_source = DataSource.objects.get(id='yso')
        except DataSource.DoesNotExist:
            yso_data_source = None

        if yso_data_source:
            # Build a cached list of YSO keywords
            cat_id_set = set()
            for yso_val in YSO_KEYWORD_MAPS.values():
                if isinstance(yso_val, tuple):
                    for t_v in yso_val:
                        cat_id_set.add('yso:' + t_v)
                else:
                    cat_id_set.add('yso:' + yso_val)

            keyword_list = Keyword.objects.filter(data_source=yso_data_source).\
                filter(id__in=cat_id_set)
            self.yso_by_id = {p.id: p for p in keyword_list}
        else:
            self.yso_by_id = {}

        if self.options['cached']:
            requests_cache.install_cache('helmet')
            self.cache = requests_cache.get_cache()
        else:
            self.cache = None
Example #15
    def get_one_page(self, url, usecache=True, ignoreerrors=True):

        limited = True

        while limited:
            if not usecache:
                if requests_cache.get_cache().has_url(url):
                    requests_cache.get_cache().delete_url(url)

            if self.username and self.password:
                print "# fetching (BASIC): %s" % url
                i = requests.get(url, auth=(self.username, self.password))
            elif self.token:
                print "# fetching (TOKEN): %s" % url
                i = requests.get(url, headers={'Authorization': "token %s" % self.token})
            else:
                print "# fetching (NONE): %s" % url
                i = requests.get(url)

            if not i.ok:
                print "# ERROR: %s for %s " % (i.reason, url)
                #import epdb; epdb.st()
                if 'rate limit exceeded' in i.text:
                    self._wait_for_limiting()
                elif i.reason == 'Not Found':
                    limited = False
                else:
                    import epdb; epdb.st()
                    sys.exit(1)
            else:
                data = None
                try:
                    data = json.loads(i.content)
                except:
                    pass
                if data:
                    if 'documentation_url' in data:
                        limited = True
                        print "# hit rate limit, sleeping 300s"
                        time.sleep(300)
                    else:
                        limited = False
                else:
                    limited = False
        return i        
Example #16
    def run(self):
        def _parse_date(str):
            if str is None:
                return datetime.now()
            return datetime(*parsedate(str)[:6])

        self.start_time = clock()
        try:
            requests_cache.install_cache('.cache')
            if not self.enable_cache:
                log.debug("removing '%s' from cache" % self.url)
                requests_cache.get_cache().delete_url(self.url)

            log.debug("fetching '%s'" % self.url)

            if self.url.startswith('file://'):
                path = self.url[7:]
                if not os.path.exists(path):
                    raise IOError("file not found: %s" % path)

                with open(path, 'r') as fd:
                    self.result = fd.read()
                    self.cached = False
                    self.date = datetime.now()
                    self.last_modified = datetime.fromtimestamp(
                        os.stat(path).st_mtime)
            else:
                self.resp = requests.get(self.url, timeout=60, verify=False)
                self.last_modified = _parse_date(
                    self.resp.headers.get('last-modified',
                                          self.resp.headers.get('date', None)))
                self.date = _parse_date(self.resp.headers['date'])
                self.cached = getattr(self.resp, 'from_cache', False)
                self.status = self.resp.status_code
                if self.resp.status_code != 200:
                    raise IOError(self.resp.reason)
                self.result = self.resp.content

            log.debug("got %d bytes from '%s'" % (len(self.result), self.url))
        except Exception as ex:
            traceback.print_exc()
            log.warn("unable to fetch '%s': %s" % (self.url, ex))
            self.ex = ex
            self.result = None
Example #17
def main():
    # The real request will only be made once; afterward, the cached response is used
    for i in range(5):
        response = requests.get('http://httpbin.org/get')

    # This is more obvious when calling a slow endpoint
    for i in range(5):
        response = requests.get('http://httpbin.org/delay/2')

    # Caching can be disabled if we want to get a fresh page and not cache it
    with requests_cache.disabled():
        print(requests.get('http://httpbin.org/ip').text)

    # Get some debugging info about the cache
    print(requests_cache.get_cache())
    print('Cached URLS:', requests_cache.get_cache().urls)

    # Uninstall to remove caching from all requests functions
    requests_cache.uninstall_cache()
def _url_in_cache(url):
    """
    If requests_cache is in use, return whether or not the URL is in the cache.
    If not, return False.
    """
    try:
        return requests_cache.get_cache().has_url(url)
    except AttributeError as e:  # requests_cache not enabled
        if str(e) == "'Session' object has no attribute 'cache'":
            return False
        raise
Example #20
def generate_csl_items(args, citekeys_df):
    """
    Generate CSL (citeproc) items for standard_citekeys in citekeys_df.
    Writes references.json to disk and logs warnings for potential problems.
    """
    # Read manual references (overrides) in JSON CSL
    manual_refs = load_manual_references(args.manual_references_paths)

    requests_cache.install_cache(args.requests_cache_path,
                                 include_get_headers=True)
    cache = requests_cache.get_cache()
    if args.clear_requests_cache:
        logging.info('Clearing requests-cache')
        requests_cache.clear()
    logging.info(
        f'requests-cache starting with {len(cache.responses)} cached responses'
    )

    csl_items = list()
    failures = list()
    for standard_citekey in citekeys_df.standard_citekey.unique():
        if standard_citekey in manual_refs:
            csl_items.append(manual_refs[standard_citekey])
            continue
        elif standard_citekey.startswith('raw:'):
            logging.error(
                f'CSL JSON Data with a standard_citekey of {standard_citekey!r} not found in manual-references.json. '
                'Metadata must be provided for raw citekeys.')
            failures.append(standard_citekey)
        try:
            csl_item = citekey_to_csl_item(standard_citekey)
            csl_items.append(csl_item)
        except Exception:
            logging.exception(
                f'Citeproc retrieval failure for {standard_citekey!r}')
            failures.append(standard_citekey)

    logging.info(
        f'requests-cache finished with {len(cache.responses)} cached responses'
    )
    requests_cache.uninstall_cache()

    if failures:
        message = 'CSL JSON Data retrieval failed for the following standardized citation keys:\n{}'.format(
            '\n'.join(failures))
        logging.error(message)

    # Write JSON CSL bibliography for Pandoc.
    with args.references_path.open('w', encoding='utf-8') as write_file:
        json.dump(csl_items, write_file, indent=2, ensure_ascii=False)
        write_file.write('\n')
    return csl_items
Example #21
def main():
    # Once cached, delayed page will be taken from cache
    # redirects also handled
    for i in range(5):
        requests.get('http://httpbin.org/delay/2')
        r = requests.get('http://httpbin.org/redirect/5')
        print(r.text)

    # And if we need to get fresh page or don't want to cache it?
    with requests_cache.disabled():
        print(requests.get('http://httpbin.org/ip').text)

    # Debugging info about cache
    print(requests_cache.get_cache())
Example #23
    def setup(self):
        ds_args = dict(id=self.name)
        defaults = dict(name='HelMet-kirjastot')
        self.data_source, _ = DataSource.objects.get_or_create(
            defaults=defaults, **ds_args)
        self.tprek_data_source = DataSource.objects.get(id='tprek')

        ahjo_ds, _ = DataSource.objects.get_or_create(defaults=defaults,
                                                      **ds_args)

        org_args = dict(id='ahjo:45400')
        defaults = dict(name='Helsingin kaupunginkirjasto',
                        data_source=ahjo_ds)
        self.organization, _ = Organization.objects.get_or_create(
            defaults=defaults, **org_args)

        # Build a cached list of Places
        loc_id_list = [l[1] for l in LOCATIONS.values()]
        place_list = Place.objects.filter(
            data_source=self.tprek_data_source).filter(
                origin_id__in=loc_id_list)
        self.tprek_by_id = {p.origin_id: p.id for p in place_list}

        try:
            yso_data_source = DataSource.objects.get(id='yso')
        except DataSource.DoesNotExist:
            yso_data_source = None

        if yso_data_source:
            # Build a cached list of YSO keywords
            cat_id_set = set()
            for yso_val in YSO_KEYWORD_MAPS.values():
                if isinstance(yso_val, tuple):
                    for t_v in yso_val:
                        cat_id_set.add('yso:' + t_v)
                else:
                    cat_id_set.add('yso:' + yso_val)

            keyword_list = Keyword.objects.filter(data_source=yso_data_source).\
                    filter(id__in=cat_id_set)
            self.yso_by_id = {p.id: p for p in keyword_list}
        else:
            self.yso_by_id = {}

        if self.options['cached']:
            requests_cache.install_cache('helmet')
            self.cache = requests_cache.get_cache()
        else:
            self.cache = None
Example #24
def _is_response_cached(method, full_url):
  """Returns True if response to GET request is in requests_cache.

  Args:
    method (str): http verb ('GET', 'POST', etc.)
    full_url (str): url, including the protocol
  Returns:
    is_cached (bool):
  """
  if method != 'GET':
    return False # pragma: no cover
  try:
    cache = requests_cache.get_cache()
  except AttributeError: # pragma: no cover
    cache = None
  return cache.has_url(full_url) if cache else False
Example #26
def generate_csl_items(args, citation_df):
    """
    Generate CSL (citeproc) items for standard_citations in citation_df.
    Writes references.json to disk and logs warnings for potential problems.
    """
    # Read manual references (overrides) in JSON CSL
    manual_refs = read_manual_references(args.manual_references_path)

    requests_cache.install_cache(args.requests_cache_path,
                                 include_get_headers=True)
    cache = requests_cache.get_cache()
    if args.clear_requests_cache:
        logging.info('Clearing requests-cache')
        requests_cache.clear()
    logging.info(
        f'requests-cache starting with {len(cache.responses)} cached responses'
    )

    csl_items = list()
    failures = list()
    for citation in citation_df.standard_citation.unique():
        if citation in manual_refs:
            csl_items.append(manual_refs[citation])
            continue
        try:
            citeproc = citation_to_citeproc(citation)
            csl_items.append(citeproc)
        except Exception as error:
            logging.exception(f'Citeproc retrieval failure for {citation}')
            failures.append(citation)

    logging.info(
        f'requests-cache finished with {len(cache.responses)} cached responses'
    )
    requests_cache.uninstall_cache()

    if failures:
        message = 'Citeproc retrieval failed for:\n{}'.format(
            '\n'.join(failures))
        logging.error(message)

    # Write JSON CSL bibliography for Pandoc.
    with args.references_path.open('w') as write_file:
        json.dump(csl_items, write_file, indent=2, ensure_ascii=False)
        write_file.write('\n')
    return csl_items
def call_api(url, doi):
    req = requests.Request('GET', url + doi)

    cache = requests_cache.get_cache()

    prepped = requests.Session().prepare_request(req)
    cache_key = cache.create_key(prepped)

    try:
        response = cache.get_response(cache_key)
    except (ImportError, TypeError):
        response = None

    if response:
        return response.json()

    return call_api_server(url, doi)
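A minimal usage sketch for call_api (the endpoint and DOI below are placeholders; call_api_server is assumed to perform the real, uncached request):

requests_cache.install_cache('doi_cache')
record = call_api('https://api.example.org/works/', '10.1234/placeholder')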
Example #29
    def get_cached_ids(self):
        with requests_cache.enabled(cache_name=GENAPI_CACHE,
                                    backend=CACHE_BACKEND):
            cached_object = requests_cache.get_cache()
            responses = [
                cached_object.get_response(response)
                for response in cached_object.responses
            ]
            gen_ids = []

            for url in [response.url for response in responses]:
                gen_id = re.search(r'{}(.*?)/'.format(self._data_endpoint),
                                   url)

                if gen_id is not None:
                    gen_ids.append(gen_id.group(1))

            return gen_ids
Example #30
    def setup(self):
        self.tprek_data_source = DataSource.objects.get(id='tprek')

        ds_args = dict(id=self.name)
        ds_defaults = dict(name='City of Espoo')
        self.data_source, _ = DataSource.objects.get_or_create(defaults=ds_defaults, **ds_args)

        org_args = dict(origin_id='kaupunki', data_source=self.data_source)
        org_defaults = dict(name='Espoon kaupunki')
        self.organization, _ = Organization.objects.get_or_create(defaults=org_defaults, **org_args)
        self._build_cache_places()
        self._cache_yso_keywords()

        if self.options['cached']:
            requests_cache.install_cache('espoo')
            self.cache = requests_cache.get_cache()
        else:
            self.cache = None
Example #31
    def setup(self):
        self.tprek_data_source = DataSource.objects.get(id='tprek')

        ds_args = dict(id=self.name)
        ds_defaults = dict(name='City of Espoo')
        self.data_source, _ = DataSource.objects.get_or_create(defaults=ds_defaults, **ds_args)

        org_args = dict(id='espoo:kaupunki')
        org_defaults = dict(name='Espoon kaupunki', data_source=self.data_source)
        self.organization, _ = Organization.objects.get_or_create(defaults=org_defaults, **org_args)
        self._build_cache_places()
        self._cache_yso_keywords()

        if self.options['cached']:
            requests_cache.install_cache('espoo')
            self.cache = requests_cache.get_cache()
        else:
            self.cache = None
def _is_url_in_cache(*args, **kwargs):
    """ Return True if request has been cached or False otherwise. """
    # Only include allowed arguments for a PreparedRequest.
    allowed_args = inspect.getfullargspec(
        requests.models.PreparedRequest.prepare).args
    # self is in there as .prepare() is a method.
    allowed_args.remove('self')

    kwargs_cleaned = {}
    for key, value in dict(kwargs).items():
        if key in allowed_args:
            kwargs_cleaned[key] = value

    prepared_request = _prepare(*args, **kwargs_cleaned)
    request_hash = _get_hash(prepared_request)
    try:
        return requests_cache.get_cache().has_key(request_hash)
    except AttributeError as e:  # requests_cache not enabled
        if str(e) == "'Session' object has no attribute 'cache'":
            return False
        raise
def main(argv):
    parser = build_cli_parser()
    opts, args = parser.parse_args(argv)
    if not opts.url or not opts.token:
        parser.print_help()
        sys.exit(-1)

    global cache_file_name
    cache_file_name = opts.cache_name
    requests_cache.install_cache(cache_file_name,
                                 allowable_methods=('GET', 'POST'))

    global cbserverurl
    cbserverurl = opts.url

    #
    # build a cbapi object
    #
    global cb
    cb = cbapi.CbApi(opts.url, token=opts.token, ssl_verify=opts.ssl_verify)

    #
    # Run Tests to get cached responses
    #
    large_process_search()

    large_binary_search()

    test_sensors()

    test_watchlist()

    test_base_endpoints()

    test_feeds()

    cache = requests_cache.get_cache()
Example #35
def extract_datareader(tickers, data_source="av-daily-adjusted", pause=None):
    """
    Retrieve daily data with web.DataReader.

    Parameters
    ----------
    tickers : str or list of strs
        Ticker or tickers to extract
    data_source : str, optional
        The data source ("quandl", "av-daily-adjusted", "iex", "fred", "ff", etc.)
    pause : float, optional
        Time, in seconds, to pause between consecutive queries of chunks

    Warns
    -----
    On remote data error or invalid ticker. If using the AlphaVantage API, an
    extra pause is made in those cases (see notes).

    Yields
    ------
    ticker : str
        Short name for the extracted data
    data : dataframe
        A dataframe with all extracted data
    metadata : dict
        Extra information about the extracted data

    Notes
    -----
    Index is forced to datetime.

    An attempt is made not to exceed the AlphaVantage API limits (a maximum of
    five calls per minute for non-premium accounts). This means that some
    pauses are made, but they are not on the conservative side, so be aware of
    remote data errors and try again later.

    Examples
    --------
    >>> for ticker, data, metadata in extract_datareader("^BVSP"):
    ...     print(ticker, metadata)
    ^BVSP {'price_column': 'adjusted close'}
    >>> for ticker, data, metadata in extract_datareader(["PETR4.SAO", "ITUB4.SAO"], pause=1.0):
    ...     print(ticker, metadata)
    PETR4.SAO {'price_column': 'adjusted close'}
    ITUB4.SAO {'price_column': 'adjusted close'}
    >>> for ticker, data, metadata in extract_datareader("BCB/11", data_source="quandl"):
    ...     print(ticker, metadata)
    BCB/11 {'price_column': 'Value'}

    """
    global _last_api_call

    if isinstance(tickers, str):
        tickers = [tickers]

    if pause is None:
        if data_source == "av-daily-adjusted":
            pause = 12.0  # max. 5 calls per minute for non-premium accounts
        else:
            pause = 1.0
    extra_pause = 0.25 * pause

    if data_source == "quandl":
        metadata = {"price_column": "Value"}
    else:
        metadata = {"price_column": "adjusted close"}
    for ticker in tickers:
        end = timer()
        if end - _last_api_call > pause:
            _last_api_call = end
        else:
            time_interval = end - _last_api_call + np.random.uniform(
                0.0, extra_pause)
            logging.info(f"Waiting {time_interval:.3f} seconds.")

            time.sleep(time_interval)
            _last_api_call = timer()

        logging.info(f"Attempting to retrieve {ticker}.")

        try:
            # TODO: support start_date and end_date
            data = web.DataReader(ticker, data_source)
        except ValueError as e:
            logging.warning(e)

            time_interval = np.random.uniform(0.0, 2.0 * pause)
            logging.info(f"Waiting {time_interval:.3f} extra seconds.")

            time.sleep(time_interval)
            continue
        except pandas_datareader._utils.RemoteDataError as e:
            logging.warning(f"Remote data error{e} for {ticker}.")

            time_interval = np.random.uniform(0.0, 2.0 * pause)
            logging.info(f"Waiting {time_interval:.3f} extra seconds.")

            time.sleep(time_interval)
            requests_cache.get_cache().remove_old_entries(
                datetime.datetime.now())
            continue
        data.index = pd.to_datetime(data.index)

        yield ticker, data, metadata
Example #36
from pandas import read_csv
import time
from datetime import datetime, timedelta
import json

from biothings.utils.common import open_anyfile
from biothings import config
logger = config.logger

import requests_cache

expire_after = timedelta(days=7)

requests_cache.install_cache('litcovidtopics_cache', expire_after=expire_after)
logger.debug("requests_cache: %s",
             requests_cache.get_cache().responses.filename)


def get_pmids(res):
    data = []
    litcovid_data = res.text.split('\n')[34:]
    for line in litcovid_data:
        if line.startswith('#') or line.startswith('p'):
            continue
        if len(line.strip()) < 5:
            continue
        data.append('pmid' + line.split('\t')[0])
    return (data)


def get_topics():
Example #37
def schema_validate(instance, options):
    """Perform STIX JSON Schema validation against the input JSON.
    Find the correct schema by looking at the 'type' property of the
    `instance` JSON object.

    Args:
        instance: A STIX JSON string.
        options: ValidationOptions instance with validation options for this
            validation run.

    Returns:
        A dictionary of validation results

    """
    if 'type' not in instance:
        raise ValidationError(
            "Input must be an object with a 'type' property.")

    # Find and load the schema
    try:
        schema_path = find_schema(options.schema_dir, instance['type'])
        schema = load_schema(schema_path)
    except (KeyError, TypeError):
        # Assume a custom object with no schema
        try:
            schema_path = find_schema(options.schema_dir, 'core')
            schema = load_schema(schema_path)
        except (KeyError, TypeError):
            raise SchemaInvalidError("Cannot locate a schema for the object's "
                                     "type, nor the base schema (core.json).")

    # Validate against schemas for specific object types later
    if instance['type'] == 'bundle':
        schema['properties']['objects'] = {
            "objects": {
                "type": "array",
                "minItems": 1
            }
        }
    elif instance['type'] == 'observed-data':
        schema['allOf'][1]['properties']['objects'] = {
            "objects": {
                "type": "object",
                "minProperties": 1
            }
        }

    # Validate the schema first
    try:
        CustomDraft4Validator.check_schema(schema)
    except schema_exceptions.SchemaError as e:
        raise SchemaInvalidError('Invalid JSON schema: ' + str(e))

    # Cache data from external sources; used in some checks
    if not options.no_cache:
        requests_cache.install_cache(expire_after=datetime.timedelta(weeks=1))
    if options.refresh_cache:
        now = datetime.datetime.utcnow()
        requests_cache.get_cache().remove_old_entries(now)

    validator = load_validator(schema_path, schema, options)
    output.info("Running the following additional checks: %s." %
                ", ".join(x.__name__ for x in validator.get_list()))

    # Actual validation of JSON document
    try:
        some_errors = validator.iter_errors(instance)
        more_errors = validator.iter_errors_more(instance)
        warnings = validator.iter_errors_more(instance, False)

        if options.strict:
            chained_errors = chain(some_errors, more_errors, warnings)
            warnings = []
        else:
            chained_errors = chain(some_errors, more_errors)
            warnings = [pretty_error(x, options.verbose) for x in warnings]
    except schema_exceptions.RefResolutionError:
        raise SchemaInvalidError('Invalid JSON schema: a JSON reference '
                                 'failed to resolve')

    # List of error generators and message prefixes (to denote which object the
    # error comes from)
    error_gens = [(chained_errors, '')]

    # Validate each object in a bundle separately
    if instance['type'] == 'bundle' and 'objects' in instance:
        for sdo in instance['objects']:
            object_validate(sdo, options, error_gens)
    else:
        object_validate(instance, options, error_gens)

    # Clear requests cache if commandline flag was set
    if options.clear_cache:
        now = datetime.datetime.utcnow()
        requests_cache.get_cache().remove_old_entries(now)

    # Prepare the list of errors
    error_list = []
    for gen, prefix in error_gens:
        for error in gen:
            msg = prefix + pretty_error(error, options.verbose)
            error_list.append(SchemaError(msg))

    if error_list:
        valid = False
    else:
        valid = True

    return ValidationResults(is_valid=valid,
                             errors=error_list,
                             warnings=warnings)
Example #38
def install_cache(path='cache'):
    """Activates the cache at `path`, unless one is already installed"""
    try:
        requests_cache.get_cache()
    except AttributeError:
        requests_cache.install_cache(path)
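For comparison, the same probe-and-install idea can be wrapped in a context manager; requests_cache's own enabled() context manager (used in other examples above) covers the common case, but a hypothetical wrapper built directly on this pattern might look like:

from contextlib import contextmanager

import requests_cache

@contextmanager
def temporary_cache(path='cache'):
    # Activate a cache for the duration of the block, then deactivate it.
    requests_cache.install_cache(path)
    try:
        yield requests_cache.get_cache()
    finally:
        requests_cache.uninstall_cache()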
Example #39
def generate_csl_items(
    citekeys: list,
    manual_refs: dict = {},
    requests_cache_path: Optional[str] = None,
    clear_requests_cache: Optional[bool] = False,
) -> list:
    """
    Generate CSL (citeproc) items for the given standard_citekeys.

    Parameters:

    - citekeys: list of standard_citekeys
    - manual_refs: mapping from standard_citekey to csl_item for manual references
    - requests_cache_path: path for the requests cache database.
      Passed as cache_name to `requests_cache.install_cache`.
      requests_cache may append an extension to this path, so it is not always the exact
      path to the cache. If None, do not use requests_cache.
    - clear_requests_cache: If True, clear the requests cache before generating citekey metadata.
    """
    # Deduplicate citations
    citekeys = list(dict.fromkeys(citekeys))

    # Install cache
    if requests_cache_path is not None:
        requests  # require `import requests` in case this is essential for monkey patching by requests_cache.
        requests_cache.install_cache(requests_cache_path,
                                     include_get_headers=True)
        cache = requests_cache.get_cache()
        if clear_requests_cache:
            logging.info("Clearing requests-cache")
            requests_cache.clear()
        logging.info(
            f"requests-cache starting with {len(cache.responses)} cached responses"
        )

    csl_items = list()
    failures = list()
    for standard_citekey in citekeys:
        if standard_citekey in manual_refs:
            csl_items.append(manual_refs[standard_citekey])
            continue
        elif standard_citekey.startswith("raw:"):
            logging.error(
                f"CSL JSON Data with a standard_citekey of {standard_citekey!r} not found in manual-references.json. "
                "Metadata must be provided for raw citekeys.")
            failures.append(standard_citekey)
        try:
            csl_item = citekey_to_csl_item(standard_citekey)
            csl_items.append(csl_item)
        except Exception:
            logging.exception(
                f"Citeproc retrieval failure for {standard_citekey!r}")
            failures.append(standard_citekey)

    # Uninstall cache
    if requests_cache_path is not None:
        logging.info(
            f"requests-cache finished with {len(cache.responses)} cached responses"
        )
        requests_cache.uninstall_cache()

    if failures:
        message = "CSL JSON Data retrieval failed for the following standardized citation keys:\n{}".format(
            "\n".join(failures))
        logging.error(message)

    return csl_items
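A hedged usage sketch for the cache-related parameters described in the docstring (the citekey and path are placeholders; citekey_to_csl_item is assumed to be importable from the surrounding project):

csl_items = generate_csl_items(
    citekeys=["doi:10.1234/placeholder"],        # hypothetical standard citekey
    requests_cache_path="cache/requests-cache",  # passed to requests_cache.install_cache as cache_name
    clear_requests_cache=False,
)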
Example #40
from datetime import datetime, timedelta

import requests
import requests_cache

requests_cache.install_cache('demo_cache')
CACHE = requests_cache.get_cache()

from django.core.management.base import BaseCommand
from django.utils.text import slugify

from organisations.models import (Organisation, OrganisationDivision,
                                  OrganisationDivisionSet)
from organisations.constants import PARENT_TO_CHILD_AREAS


class Command(BaseCommand):
    skip_gss = [
        "E12000007",
        "W08000001"  # See import_welsh_areas
    ]
    BASE = "http://mapit.mysociety.org"

    def add_arguments(self, parser):
        parser.add_argument('--always_pick_option',
                            action='store',
                            type=int,
                            default=0)

    def handle(self, **options):
        self.always_pick_option = int(options['always_pick_option'])
Example #41
def delete_cached_url(url):
    """Deletes the given URL from the cache"""
    try:
        requests_cache.get_cache().delete_url(url)
    except AttributeError:
        pass
Example #42
def clear_requests_cache():
    """
    Clears all cached responses.
    """
    now = datetime.datetime.utcnow()
    requests_cache.get_cache().remove_old_entries(now)
Example #43
import requests_cache

requests_cache.configure("example_cache")


def main():
    # Once cached, delayed page will be taken from cache
    # redirects also handled
    for i in range(5):
        requests.get("http://httpbin.org/delay/2")
        r = requests.get("http://httpbin.org/redirect/5")
        print(r.text)

    # What about async? It's also supported!
    rs = [async.get("http://httpbin.org/delay/%s" % i) for i in range(5)]
    for r in async.map(rs):
        print(r.text)

    # And if we need to get fresh page or don't want to cache it?
    with requests_cache.disabled():
        print(requests.get("http://httpbin.org/ip").text)

    # Debugging info about cache
    print(requests_cache.get_cache())


if __name__ == "__main__":
    t = time.time()
    main()
    print("Elapsed: %.3f seconds" % (time.time() - t))