def __init__(self):
    self._API_SERVER = "https://data.nba.net"
    # Cache responses heuristically based on their Last-Modified headers,
    # and mount the same adapter on both schemes.
    self._cache_control_adapter = CacheControlAdapter(heuristic=LastModified())
    self._requests_session = requests.Session()
    self._requests_session.mount('http://', self._cache_control_adapter)
    self._requests_session.mount('https://', self._cache_control_adapter)
    self._TEAM_TRICODES = frozenset(
        ('CHA', 'ATL', 'IND', 'MEM', 'DET', 'UTA', 'CHI', 'TOR', 'CLE', 'OKC',
         'DAL', 'MIN', 'BOS', 'SAS', 'MIA', 'DEN', 'LAL', 'PHX', 'NOP', 'MIL',
         'HOU', 'NYK', 'ORL', 'SAC', 'PHI', 'BKN', 'POR', 'GSW', 'LAC', 'WAS'))
    self._STAT_CATEGORIES = frozenset(
        ('ppg', 'trpg', 'apg', 'fgp', 'ftp', 'tpp', 'bpg', 'spg', 'tpg', 'pfpg'))
    self._CONFERENCES = frozenset(('west', 'east'))
    self._EASTERN_DIVISIONS = frozenset(('southeast', 'atlantic', 'central'))
    self._WESTERN_DIVISIONS = frozenset(('southwest', 'pacific', 'northwest'))
    self._DIVISIONS = {'west': self._WESTERN_DIVISIONS,
                       'east': self._EASTERN_DIVISIONS}
    # Cached dictionaries. Saving these copies avoids having to
    # re-parse JSONs when they are returned from the HTTP cache.
    self._person_ids = None
    self._team_ids_to_tricodes = None
    self._team_tricodes_to_ids = None

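The constructor above relies on mounting a single heuristic-driven adapter for both URL schemes. A minimal, self-contained sketch of that pattern, assuming only that the target server sends Last-Modified headers (the request URL is a placeholder):

import requests
from cachecontrol import CacheControlAdapter
from cachecontrol.heuristics import LastModified

# LastModified synthesises an Expires header from a response's
# Last-Modified date, so responses lacking explicit cache headers
# can still be cached and reused.
adapter = CacheControlAdapter(heuristic=LastModified())
session = requests.Session()
session.mount('http://', adapter)
session.mount('https://', adapter)

# Placeholder URL; a repeat GET within the inferred freshness
# window can be served from the cache without hitting the network.
response = session.get('https://data.nba.net/')
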
class Settings:
    do_update_wikidata = True

    # Don't activate this, it's most likely broken
    do_update_wikipedia = False

    sparql_file = "free_software_items.rq"
    oauth_token_file = "github_oauth_token.txt"

    # pywikibot is too stupid to cache the calendar model, so let's do this manually
    calendarmodel = pywikibot.Site().data_repository().calendarmodel()
    wikidata_repo = pywikibot.Site("wikidata", "wikidata").data_repository()

    repo_regex = re.compile(r"https://github.com/[^/]+/[^/]+")
    version_regex = re.compile(r"\d+(\.\d+)+")
    unmarked_prerelease_regex = re.compile(
        r"[ -._\d](b|r|rc|beta|alpha)([ .\d].*)?$", re.IGNORECASE)

    cached_session = CacheControl(requests.Session(),
                                  cache=FileCache('cache', forever=True),
                                  heuristic=LastModified())

    properties = {
        "software version": "P348",
        "publication date": "P577",
        "retrieved": "P813",
        "reference URL": "P854",
        "official website": "P856",
        "source code repository": "P1324",
    }

class Settings:
    do_update_wikidata = True

    # Don't activate this, it's most likely broken
    do_update_wikipedia = False

    normalize_url = True
    sparql_file = "free_software_items.rq"

    # pywikibot is too stupid to cache the calendar model, so let's do this manually
    calendarmodel = pywikibot.Site().data_repository().calendarmodel()
    wikidata_repo = pywikibot.Site("wikidata", "wikidata").data_repository()

    repo_regex = re.compile(r"^[a-z]+://github.com/[^/]+/[^/]+/?$")

    cached_session = CacheControl(
        requests.Session(),
        cache=FileCache("cache", forever=True),
        heuristic=LastModified(),
    )

    properties = {
        "software version": "P348",
        "publication date": "P577",
        "retrieved": "P813",
        "reference URL": "P854",
        "official website": "P856",
        "source code repository": "P1324",
        "title": "P1476",
        "protocol": "P2700",
    }

def __init__(self, uri: str = None, session: requests.Session = None, seed: str = None):
    # Airtable and gssutils are using slightly different field names....
    self.meta_field_mapping = {"published": "issued"}

    # Add an explicit on/off for temp scraping (based on presence of dataURL)
    self.temp_scrape = False

    # Use seed if provided
    if seed is not None:
        with open(seed, "r") as f:
            self.seed = json.load(f)
        if "dataURL" in self.seed:
            logging.warning(
                "A temporary dataURL has been specified; proceeding with a temp scrape."
            )
            self.temp_scrape = True
        if "landingPage" not in self.seed.keys():
            raise MetadataError(
                'We always need to provide a "landingPage" via the seed, either'
                " on its own or alongside a dataURL for temporary scrapes."
            )
        uri = self.seed["landingPage"]
    else:
        self.seed = None

    self.uri = uri
    self.dataset = pmdcat.Dataset(uri)
    self.catalog = dcat.Catalog()
    self.dataset.modified = datetime.now(timezone.utc).astimezone()
    self.distributions = []

    if session:
        self.session = session
    elif "RECORD_MODE" in os.environ:
        # don't use cachecontrol, but we'll need to patch the session when used.
        self.session = requests.Session()
    else:
        self.session = CacheControl(
            requests.Session(),
            cache=FileCache(".cache"),
            serializer=BiggerSerializer(),
            heuristic=LastModified(),
        )

    if "JOB_NAME" in os.environ:
        self._base_uri = URIRef("http://gss-data.org.uk")
        self._dataset_id = pathify(os.environ["JOB_NAME"])
    else:
        self._base_uri = BNode()
        parsed_scrape_uri = urlparse(self.uri)
        self._dataset_id = (parsed_scrape_uri.netloc.replace(".", "/")
                            + parsed_scrape_uri.path)

    self.update_dataset_uris()
    self._run()

def __init__(self, name: str = None, description: str = None, version: str = None):
    self.app_id = {'X-TBA-App-Id': ""}
    self.session = requests.Session()
    self.session = CacheControl(self.session, heuristic=LastModified())
    self.session.headers.update(self.app_id)
    if name is not None:
        self.set_api_key(name, description, version)

def __init__(self, uri: str = None, session: requests.Session = None, seed: str = None):
    # Airtable and gssutils are using slightly different field names....
    self.meta_field_mapping = {
        "published": "issued"
    }

    # Add an explicit on/off for temp scraping (based on presence of dataURL)
    self.temp_scrape = False

    # Use seed if provided
    if seed is not None:
        with open(seed, "r") as f:
            self.seed = json.load(f)
        if "dataURL" in self.seed:
            logging.warning("A temporary dataURL has been specified; proceeding with a temp scrape.")
            uri = self.seed["dataURL"]
            self.temp_scrape = True
        elif "landingPage" not in self.seed:
            raise MetadataError("Aborting, insufficient seed data. No landing page supplied via "
                                "info.json and no dataURL to use as a fallback.")
        else:
            uri = self.seed["landingPage"]
    else:
        self.seed = None

    self.uri = uri
    self.dataset = pmdcat.Dataset(uri)
    self.catalog = dcat.Catalog()
    self.dataset.modified = datetime.now(timezone.utc).astimezone()
    self.distributions = []

    if session:
        self.session = session
    else:
        self.session = CacheControl(requests.Session(),
                                    cache=FileCache('.cache'),
                                    serializer=BiggerSerializer(),
                                    heuristic=LastModified())

    if 'JOB_NAME' in os.environ:
        self._base_uri = URIRef('http://gss-data.org.uk')
        self._dataset_id = pathify(os.environ['JOB_NAME'])
    else:
        self._base_uri = BNode()
        parsed_scrape_uri = urlparse(self.uri)
        self._dataset_id = parsed_scrape_uri.netloc.replace('.', '/') + parsed_scrape_uri.path

    self.update_dataset_uris()
    self._run()

def csv_dialect(fd):
    snippet = fd.read(1024).encode('utf-8') if PY2 else fd.read(1024)
    fd.seek(0)
    return csv.Sniffer().sniff(snippet)


### HTTP utils ###

try:
    import requests
    from cachecontrol import CacheControl, CacheControlAdapter
    from cachecontrol.caches import FileCache
    from cachecontrol.heuristics import LastModified
    cache_dir = '%s/Library/Caches/PlotDevice' % os.environ['HOME']
    HTTP = CacheControl(requests.Session(), cache=FileCache(cache_dir), heuristic=LastModified())
except ImportError:
    class Decoy(object):
        def get(self, url):
            unsupported = 'could not find the "requests" library (try running "python setup.py build" first)'
            raise RuntimeError(unsupported)
    HTTP = Decoy()


def binaryish(content, format):
    bin_types = ('pdf', 'eps', 'png', 'jpg', 'jpeg', 'gif', 'tiff', 'tif', 'zip', 'tar', 'gz')
    bin_formats = ('raw', 'bytes', 'img', 'image')
    if any(b in content for b in bin_types):
        return True
    if format:
        return any(b in format for b in bin_types + bin_formats)
    return False

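The try/except guard above lets the module import even when requests is unavailable. A brief sketch of how calling code would exercise HTTP either way (the URL is a placeholder):

# If requests imported, responses are cached on disk under cache_dir with
# expiry inferred by the LastModified heuristic; otherwise Decoy.get()
# raises a RuntimeError pointing at the missing dependency.
try:
    resp = HTTP.get('https://example.com/data.json')  # placeholder URL
    print(resp.status_code)
except RuntimeError as err:
    print(err)
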
if is_interactive():
    import requests
    from cachecontrol import CacheControl
    from cachecontrol.caches.file_cache import FileCache
    from cachecontrol.heuristics import LastModified
    from pathlib import Path
    session = CacheControl(requests.Session(),
                           cache=FileCache('.cache'),
                           heuristic=LastModified())
    sourceFolder = Path('in')
    sourceFolder.mkdir(exist_ok=True)
    inputURL = 'https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/Alcohol_Tables_17.xls'
    inputFile = sourceFolder / 'Alcohol_Tables_17.xls'
    response = session.get(inputURL)
    with open(inputFile, 'wb') as f:
        f.write(response.content)
    tab = loadxlstabs(inputFile, sheetids='Table 2')[0]

class TestModifiedUnitTests(object):

    def last_modified(self, period):
        return time.strftime(TIME_FMT, time.gmtime(self.time_now - period))

    def setup(self):
        self.heuristic = LastModified()
        self.time_now = time.time()
        day_in_seconds = 86400
        self.year_ago = self.last_modified(day_in_seconds * 365)
        self.week_ago = self.last_modified(day_in_seconds * 7)
        self.day_ago = self.last_modified(day_in_seconds)
        self.now = self.last_modified(0)

        # NOTE: We pass in a negative to get a positive... Probably
        # should refactor.
        self.day_ahead = self.last_modified(-day_in_seconds)

    def test_no_expiry_is_inferred_when_no_last_modified_is_present(self):
        assert self.heuristic.update_headers(DummyResponse(200, {})) == {}

    def test_expires_is_not_replaced_when_present(self):
        resp = DummyResponse(200, {"Expires": self.day_ahead})
        assert self.heuristic.update_headers(resp) == {}

    def test_last_modified_is_used(self):
        resp = DummyResponse(200, {"Date": self.now, "Last-Modified": self.week_ago})
        modified = self.heuristic.update_headers(resp)
        assert ["expires"] == list(modified.keys())
        assert datetime(*parsedate(modified["expires"])[:6]) > datetime.now()

    def test_last_modified_is_not_used_when_cache_control_present(self):
        resp = DummyResponse(
            200,
            {
                "Date": self.now,
                "Last-Modified": self.week_ago,
                "Cache-Control": "private",
            },
        )
        assert self.heuristic.update_headers(resp) == {}

    def test_last_modified_is_not_used_when_status_is_unknown(self):
        resp = DummyResponse(299, {"Date": self.now, "Last-Modified": self.week_ago})
        assert self.heuristic.update_headers(resp) == {}

    def test_last_modified_is_used_when_cache_control_public(self):
        resp = DummyResponse(
            200,
            {
                "Date": self.now,
                "Last-Modified": self.week_ago,
                "Cache-Control": "public",
            },
        )
        modified = self.heuristic.update_headers(resp)
        assert ["expires"] == list(modified.keys())
        assert datetime(*parsedate(modified["expires"])[:6]) > datetime.now()

    def test_warning_not_added_when_response_more_recent_than_24_hours(self):
        resp = DummyResponse(200, {"Date": self.now, "Last-Modified": self.week_ago})
        assert self.heuristic.warning(resp) is None

    def test_warning_is_not_added_when_heuristic_was_not_used(self):
        resp = DummyResponse(200, {"Date": self.now, "Expires": self.day_ahead})
        assert self.heuristic.warning(resp) is None

    def test_expiry_is_no_more_than_twenty_four_hours(self):
        resp = DummyResponse(200, {"Date": self.now, "Last-Modified": self.year_ago})
        modified = self.heuristic.update_headers(resp)
        assert ["expires"] == list(modified.keys())
        assert self.day_ahead == modified["expires"]

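The tests above use a DummyResponse helper that the excerpt does not define. A minimal sketch of a compatible test double, assuming LastModified.update_headers() and warning() only read a response's status code and headers:

class DummyResponse(object):
    # Bare-bones stand-in for a urllib3 response: the LastModified
    # heuristic only inspects .status and .headers, so nothing more
    # is needed for these tests.
    def __init__(self, status, headers):
        self.status = status
        self.headers = headers
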
def setup(self):
    self.sess = Session()
    self.cached_sess = CacheControl(self.sess, heuristic=LastModified())

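A short usage sketch of what this fixture sets up: the wrapped session transparently caches responses whose freshness LastModified can infer (the URL is a placeholder):

import requests
from cachecontrol import CacheControl
from cachecontrol.heuristics import LastModified

sess = requests.Session()
cached_sess = CacheControl(sess, heuristic=LastModified())

# The first request goes to the network; the repeat can be answered
# from the local cache while the heuristically inferred expiry holds.
first = cached_sess.get('https://example.com/')  # placeholder URL
second = cached_sess.get('https://example.com/')
print(first.status_code, second.status_code)
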
import os
import datetime

import requests
from .data.rfeed import Item, Feed
from flask import Flask, jsonify, request as flask_request
from cachecontrol import CacheControlAdapter
from cachecontrol.heuristics import LastModified

app = Flask(__name__)

adapter = CacheControlAdapter(heuristic=LastModified())
sess = requests.Session()
sess.mount('http://', adapter)
sess.mount('https://', adapter)

SERVICE_NAME = os.path.splitext(os.path.basename(__file__))[0]


@app.route("/rss/summary", methods=['GET'])
def latest_articles():
    if flask_request.method == 'GET':
        response = sess.get('http://localhost/article/collect/10')
        article_collection = []
        if response.status_code == requests.codes.ok:
            articles = response.json()['success']
            for article in articles:
                article_collection.append(
                    Item(
                        title=article['title'],
                        author=article['author'],
                        pubDate=datetime.datetime.strptime(