def get(url, **kwargs):
    """Fetch the closest Wayback Machine snapshot of *url*.

    Queries the archive.org availability API (with *url*'s query string
    stripped), then downloads the closest archived snapshot.

    Returns the snapshot response, or None when no usable snapshot
    exists or any request fails.
    """
    # Default backoff for get_retries; callers may override via kwargs.
    # (Was duplicated inline as {'max_backoff': 128, **kwargs} twice.)
    kwargs.setdefault('max_backoff', 128)
    response = get_retries.get(
        f"http://archive.org/wayback/available?url={url.split('?')[0]}",
        **kwargs)
    if not response or response.status_code != 200:
        return None
    r_json = response.json()
    # 'closest' is absent when archive.org holds no snapshot of the URL.
    if 'closest' not in r_json['archived_snapshots']:
        return None
    clo = r_json['archived_snapshots']['closest']
    # The snapshot itself must have been archived with HTTP 200.
    if clo['status'] != '200':
        return None
    response_final = get_retries.get(clo['url'], **kwargs)
    if not response_final or response_final.status_code != 200:
        return None
    return response_final
def test_verbose_false(capsys):
    """verbose=False must suppress all retry logging, even on timeout."""
    get_retries.get('https://httpbin.org/delay/10',
                    timeout=5, verbose=False, max_backoff=4)
    captured, _ = capsys.readouterr()
    print(captured)
    assert not captured
def fetch(url):
    """Download *url* and return it parsed as a BeautifulSoup document.

    Returns None when the request ultimately fails — get_retries.get
    returns None after exhausting retries, and the original code then
    crashed with AttributeError on `.text`.
    """
    response = get_retries.get(url, headers=headers, verbose=True,
                               max_backoff=128)
    # Guard against retry exhaustion (consistent with the None-on-failure
    # convention used elsewhere in this file).
    if response is None:
        return None
    return BeautifulSoup(response.text, "lxml")
def dl(url, fn):
    """Download the PDF at *url* into pdfs/<fn>.

    Some URLs serve an intermediate HTML page that contains the actual
    download link; in that case the real PDF is fetched by recursing on
    the linked URL.
    """
    response = get_retries.get(url, headers=headers, verbose=True,
                               max_backoff=128)
    # Sometimes there is an additional HTML page in front of the PDF.
    if "text/html" in response.headers["content-type"]:
        print(url)
        soup = BeautifulSoup(response.text, "lxml")
        link = soup.find("a", class_="Publication")
        if link is None:
            link = soup.find("a", class_="downloadLink")
        url = urljoin(base_url, link["href"])
        dl(url, fn)
        # BUG FIX: stop here. The original fell through and asserted
        # "application/pdf" on the HTML response, which always failed.
        return
    assert "application/pdf" in response.headers["content-type"], (
        url + response.headers["content-type"])
    with open("pdfs/" + fn, "wb") as f:
        try:
            f.write(response.content)
        except Exception as e:
            # Best-effort: log the failure and move on to the next file.
            print(e, url, response)
def fetch(url):
    """GET *url* via get_retries, presenting a desktop-Chrome User-Agent."""
    user_agent = ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36")
    return get_retries.get(url, headers={"User-Agent": user_agent})
# Register the chronicler "ezra" (support service for victims of
# right-wing, racist and antisemitic violence in Thuringia, Germany)
# in the chronicles table, upserting on chronicler_name.
tab_chronicles.upsert(
    {
        "iso3166_1": "DE",
        "iso3166_2": "DE-TH",
        "chronicler_name": "ezra",
        "chronicler_description": "ezra ist die Beratung für Betroffene rechter, rassistischer und antisemitischer Gewalt in Thüringen. Wir beraten, begleiten und unterstützen Menschen, die aus Motiven gruppenbezogener Menschenfeindlichkeit angegriffen werden – also deshalb, weil die Täter*innen sie einer von ihnen abgelehnten Personengruppe zuordnen. Daneben richtet sich unser Angebot auch an Angehörige von Betroffenen und an Zeug*innen.",
        "chronicler_url": "https://ezra.de/",
        "chronicle_source": "https://ezra.de/chronik/",
    },
    ["chronicler_name"],
)
# Fetch the full incident chronicle from ezra's WordPress JSON API.
url = "https://angstraeume.ezra.de/wp-json/ezra/v1/chronic"
json_data = get_retries.get(url, verbose=True, max_backoff=128).json()
# Lookup tables mapping numeric ids to human-readable labels.
meta_motives = json_data["meta"]["motives"]
meta_locations = json_data["meta"]["locations"]
for x in json_data["entries"]:
    city = x["locationDisplay"]
    # First location id resolves to the county; entries may have none.
    county = meta_locations[str(
        x["locations"][0])] if len(x["locations"]) > 0 else None
    # Dates are German-language display strings, e.g. "3. März 2021".
    date = parse(x["startDisplay"], languages=["de"])
    title = x["title"]
    # Entry content is HTML; strip markup down to plain text.
    description = BeautifulSoup(x["content"], "lxml").get_text().strip()
    # Stable per-source report id, prefixed to avoid collisions.
    rg_id = "ezra-" + str(x["id"])
    motives = ", ".join([meta_motives[str(xx)] for xx in x["motives"]])
def test_200():
    """A plain HTTP 200 response is returned unchanged."""
    response = get_retries.get('https://httpbin.org/status/200')
    assert response.status_code == 200
def test_timeout_success():
    """A timeout longer than the server delay must yield a response."""
    res = get_retries.get('https://httpbin.org/delay/10', timeout=11)
    # Idiomatic identity check (was the awkward `not res is None`).
    assert res is not None
def test_timeout():
    """A timeout shorter than the server delay gives up and returns None."""
    result = get_retries.get('https://httpbin.org/delay/10', timeout=5)
    assert result is None
def test_400():
    """A 4xx status yields None, and capped backoff keeps retries quick."""
    started = timer()
    response = get_retries.get('https://httpbin.org/status/400', max_backoff=8)
    elapsed = timer() - started
    assert response is None
    assert elapsed < 15