Example #1
0
def get(url, **kwargs):
    """Fetch the closest archived copy of *url* via the archive.org availability API.

    The query string of *url* (everything from the first ``?``) is dropped
    before the lookup. Both HTTP requests go through ``get_retries.get``.

    Args:
        url: The original page URL to look up.
        **kwargs: Extra keyword arguments forwarded to ``get_retries.get`` for
            both requests. ``max_backoff`` defaults to 128 unless overridden
            here.

    Returns:
        The response object of the archived snapshot, or ``None`` when the
        lookup fails, the lookup body is missing or malformed, no closest
        snapshot with status ``'200'`` exists, or fetching the snapshot fails.
    """
    # Build the options once; caller-supplied values win over the default.
    options = {'max_backoff': 128, **kwargs}

    lookup = get_retries.get(
        f"http://archive.org/wayback/available?url={url.split('?')[0]}",
        **options)
    if not lookup or lookup.status_code != 200:
        return None

    try:
        payload = lookup.json()
    except ValueError:
        # NOTE(review): assumes an undecodable body surfaces as a ValueError
        # subclass (as `requests` does) -- confirm for get_retries responses.
        return None

    # Walk the payload defensively: every other failure in this function is
    # reported as None, so a malformed body should not raise KeyError.
    snapshots = (payload.get('archived_snapshots')
                 if isinstance(payload, dict) else None)
    closest = snapshots.get('closest') if isinstance(snapshots, dict) else None
    if not isinstance(closest, dict):
        return None

    # The API reports the snapshot's status as a string.
    if closest.get('status') != '200':
        return None

    snapshot_url = closest.get('url')
    if not snapshot_url:
        return None

    snapshot = get_retries.get(snapshot_url, **options)
    if not snapshot or snapshot.status_code != 200:
        return None

    return snapshot
Example #2
0
def test_verbose_false(capsys):
    """With ``verbose=False`` a timing-out request must write nothing to stdout."""
    get_retries.get(
        'https://httpbin.org/delay/10',
        timeout=5,
        verbose=False,
        max_backoff=4,
    )
    captured = capsys.readouterr()
    out = captured.out
    # Echo whatever was captured so it is visible if the assertion fails.
    print(out)
    assert not out
Example #3
0
def fetch(url):
    """Download *url* and return its body parsed with the ``lxml`` parser.

    Uses the module-level ``headers`` and retries with ``max_backoff=128``.
    The response is used without checking for a failed request.
    """
    response = get_retries.get(
        url,
        headers=headers,
        verbose=True,
        max_backoff=128,
    )
    return BeautifulSoup(response.text, "lxml")
Example #4
0
def dl(url, fn):
    """Download the PDF at *url* and store it as ``pdfs/<fn>``.

    If the server answers with an HTML page instead of the PDF, the page is
    searched for an ``<a>`` with class ``Publication`` (falling back to
    ``downloadLink``), and the download is retried on that link, resolved
    against the module-level ``base_url``.

    Args:
        url: Address of the PDF, or of an HTML page linking to it.
        fn: File name to write inside the ``pdfs/`` directory.

    Raises:
        AssertionError: If the final response is not ``application/pdf``.
    """
    response = get_retries.get(url,
                               headers=headers,
                               verbose=True,
                               max_backoff=128)

    # sometimes there is an additional HTML page
    if "text/html" in response.headers["content-type"]:
        print(url)
        soup = BeautifulSoup(response.text, "lxml")
        link = soup.find("a", class_="Publication")
        if link is None:
            link = soup.find("a", class_="downloadLink")

        url = urljoin(base_url, link["href"])
        dl(url, fn)
        # The recursive call handled the real download. Without returning
        # here, the HTML response would reach the PDF assertion below and
        # always fail.
        return

    assert "application/pdf" in response.headers["content-type"], (
        url + response.headers["content-type"])
    with open("pdfs/" + fn, "wb") as f:
        try:
            f.write(response.content)
        except Exception as e:
            # Best effort: report the failure and keep going.
            print(e, url, response)
Example #5
0
def fetch(url):
    """Request *url* through ``get_retries.get`` with a desktop Chrome User-Agent."""
    user_agent = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36"
    )
    return get_retries.get(url, headers={"User-Agent": user_agent})
Example #6
0
# Insert or update the chronicle's metadata row, matched on "chronicler_name".
tab_chronicles.upsert(
    dict(
        iso3166_1="DE",
        iso3166_2="DE-TH",
        chronicler_name="ezra",
        chronicler_description="ezra ist die Beratung für Betroffene rechter, rassistischer und antisemitischer Gewalt in Thüringen. Wir beraten, begleiten und unterstützen Menschen, die aus Motiven gruppenbezogener Menschenfeindlichkeit angegriffen werden – also deshalb, weil die Täter*innen sie einer von ihnen abgelehnten Personengruppe zuordnen. Daneben richtet sich unser Angebot auch an Angehörige von Betroffenen und an Zeug*innen.",
        chronicler_url="https://ezra.de/",
        chronicle_source="https://ezra.de/chronik/",
    ),
    ["chronicler_name"],
)

# JSON endpoint that serves the chronicle entries together with lookup tables.
url = "https://angstraeume.ezra.de/wp-json/ezra/v1/chronic"

json_data = get_retries.get(
    url,
    verbose=True,
    max_backoff=128,
).json()

# Lookup tables, indexed below by the stringified ids found in each entry.
meta_motives = json_data["meta"]["motives"]
meta_locations = json_data["meta"]["locations"]

# Derive the per-entry fields; only the extraction is visible in this section.
for x in json_data["entries"]:
    city = x["locationDisplay"]

    # The county comes from the entry's first location id, when there is one.
    if len(x["locations"]) > 0:
        county = meta_locations[str(x["locations"][0])]
    else:
        county = None

    date = parse(x["startDisplay"], languages=["de"])
    title = x["title"]
    # Reduce the HTML content to plain text.
    description = BeautifulSoup(x["content"], "lxml").get_text().strip()
    rg_id = f"ezra-{x['id']!s}"
    motives = ", ".join(meta_motives[str(xx)] for xx in x["motives"])
Example #7
0
def test_200():
    """A plain 200 endpoint yields a response whose status code is 200."""
    response = get_retries.get('https://httpbin.org/status/200')
    assert response.status_code == 200
Example #8
0
def test_timeout_success():
    """A 10 s delayed endpoint still returns a response with an 11 s timeout."""
    response = get_retries.get('https://httpbin.org/delay/10', timeout=11)
    assert response is not None
Example #9
0
def test_timeout():
    """A 10 s delayed endpoint with a 5 s timeout yields ``None``."""
    response = get_retries.get('https://httpbin.org/delay/10', timeout=5)
    assert response is None
Example #10
0
def test_400():
    """A 400 endpoint yields ``None`` in under 15 s with ``max_backoff=8``."""
    began = timer()
    response = get_retries.get('https://httpbin.org/status/400', max_backoff=8)
    assert response is None
    elapsed = timer() - began
    assert elapsed < 15