def test_url_to_filename_with_etags(self):
    """Filenames hashed with an etag round-trip back to (url, etag), and the
    etag participates in the hash separately from the url (no collisions)."""
    for url in [
        'http://allenai.org',
        'http://allennlp.org',
        'https://www.google.com',
        'http://pytorch.org',
    ]:
        filename = url_to_filename(url, etag="mytag")
        assert "http" not in filename
        pathlib.Path(os.path.join(self.TEST_DIR, filename)).touch()
        # Write the .json metadata sidecar with a context manager so the
        # handle is closed deterministically (the original passed an
        # anonymous open() to json.dump and leaked it).
        with open(os.path.join(self.TEST_DIR, filename + '.json'), 'w') as meta_file:
            json.dump({'url': url, 'etag': 'mytag'}, meta_file)
        back_to_url, etag = filename_to_url(filename, cache_dir=self.TEST_DIR)
        assert back_to_url == url
        assert etag == "mytag"

    # url "…/" + "1" must not hash to the same name as url "…/" with etag "1".
    baseurl = 'http://allenai.org/'
    assert url_to_filename(baseurl + '1') != url_to_filename(baseurl, etag='1')
def test_cached_path(self):
    """`cached_path` passes existing local paths through, rejects bad
    inputs, and downloads-and-caches remote urls."""
    url = 'http://fake.datastore.com/glove.txt.gz'
    set_up_glove(url, self.glove_bytes)

    # A local path that does not exist raises FileNotFoundError.
    with pytest.raises(FileNotFoundError):
        cached_path(self.FIXTURES_ROOT / "does_not_exist" / "fake_file.tar.gz")

    # A URI with an unknown scheme raises ValueError.
    with pytest.raises(ValueError):
        cached_path("fakescheme://path/to/fake/file.tar.gz")

    # An existing local file is returned as-is (stringified).
    assert cached_path(self.glove_file) == str(self.glove_file)

    # A url is fetched (one HEAD + one GET) and cached under its hashed name.
    cached = cached_path(url, cache_dir=self.TEST_DIR)
    assert len(responses.calls) == 2
    expected = os.path.join(self.TEST_DIR, url_to_filename(url, etag="0"))
    assert cached == expected
    with open(cached, 'rb') as cached_file:
        assert cached_file.read() == self.glove_bytes
def test_cached_path(self):
    """`cached_path` handles local files, bad inputs, remote urls, and the
    `archive.tar.gz!member` extraction syntax."""
    url = "http://fake.datastore.com/glove.txt.gz"
    set_up_glove(url, self.glove_bytes)

    # Missing local file -> FileNotFoundError.
    with pytest.raises(FileNotFoundError):
        cached_path(self.FIXTURES_ROOT / "does_not_exist" / "fake_file.tar.gz")

    # Unknown URI scheme -> ValueError.
    with pytest.raises(ValueError):
        cached_path("fakescheme://path/to/fake/file.tar.gz")

    # Existing local file comes back as its own (stringified) path.
    assert cached_path(self.glove_file) == str(self.glove_file)

    # A url is downloaded (HEAD + GET) and cached under its hashed name.
    cached = cached_path(url, cache_dir=self.TEST_DIR)
    assert len(responses.calls) == 2
    expected = os.path.join(self.TEST_DIR, url_to_filename(url, etag="0"))
    assert cached == expected
    with open(cached, "rb") as cached_file:
        assert cached_file.read() == self.glove_bytes

    # Extracting a single member from inside an archive via "!".
    extracted = cached_path(
        self.FIXTURES_ROOT / "common" / "quote.tar.gz!quote.txt",
        extract_archive=True,
    )
    with open(extracted, "r") as f:
        assert f.read().startswith("I mean, ")
def test_url_to_filename(self):
    """Hashed filenames contain no scheme, require the .json metadata file
    to decode, and round-trip back to (url, None) when no etag was used."""
    for url in [
        "http://allenai.org",
        "http://allennlp.org",
        "https://www.google.com",
        "http://pytorch.org",
        # Very long urls must still hash to a usable filename.
        "https://allennlp.s3.amazonaws.com" + "/long" * 20 + "/url",
    ]:
        filename = url_to_filename(url)
        assert "http" not in filename
        # Nothing cached yet: decoding must fail.
        with pytest.raises(FileNotFoundError):
            filename_to_url(filename, cache_dir=self.TEST_DIR)
        pathlib.Path(os.path.join(self.TEST_DIR, filename)).touch()
        # The data file alone is not enough -- the .json metadata is required.
        with pytest.raises(FileNotFoundError):
            filename_to_url(filename, cache_dir=self.TEST_DIR)
        # Use a context manager so the metadata handle is closed (the
        # original passed an anonymous open() to json.dump and leaked it).
        with open(os.path.join(self.TEST_DIR, filename + ".json"), "w") as meta_file:
            json.dump({"url": url, "etag": None}, meta_file)
        back_to_url, etag = filename_to_url(filename, cache_dir=self.TEST_DIR)
        assert back_to_url == url
        assert etag is None
def test_get_from_cache(self):
    """`get_from_cache` probes the etag with a HEAD request each call and
    only re-downloads (GET) when the etag has changed."""
    url = 'http://fake.datastore.com/glove.txt.gz'
    set_up_glove(url, self.glove_bytes, change_etag_every=2)

    def request_counts():
        # Tally HTTP methods across every mocked call made so far.
        return Counter(call.request.method for call in responses.calls)

    # First fetch: one HEAD (etag probe) plus one GET (download).
    filename = get_from_cache(url, cache_dir=self.TEST_DIR)
    assert filename == os.path.join(self.TEST_DIR, url_to_filename(url, etag="0"))
    counts = request_counts()
    assert len(counts) == 2
    assert counts['HEAD'] == 1
    assert counts['GET'] == 1
    with open(filename, 'rb') as cached_file:
        assert cached_file.read() == self.glove_bytes

    # Second call sees the same etag: another HEAD, but no new GET.
    filename2 = get_from_cache(url, cache_dir=self.TEST_DIR)
    assert filename2 == filename
    counts = request_counts()
    assert len(counts) == 2
    assert counts['HEAD'] == 2
    assert counts['GET'] == 1
    with open(filename2, 'rb') as cached_file:
        assert cached_file.read() == self.glove_bytes

    # Third call sees a changed etag, forcing a fresh download and a new
    # cache entry keyed by the new etag.
    filename3 = get_from_cache(url, cache_dir=self.TEST_DIR)
    assert filename3 == os.path.join(self.TEST_DIR, url_to_filename(url, etag="1"))
    counts = request_counts()
    assert len(counts) == 2
    assert counts['HEAD'] == 3
    assert counts['GET'] == 2
    with open(filename3, 'rb') as cached_file:
        assert cached_file.read() == self.glove_bytes
def test_cached_path_offline(self, monkeypatch):
    """With no internet connection (the etag probe raises ConnectionError),
    `get_from_cache` falls back to the newest locally cached version."""

    # Simulate "offline": the etag request always fails.
    def mocked_http_etag(url: str):
        raise ConnectionError

    monkeypatch.setattr(file_utils, "_http_etag", mocked_http_etag)

    url = "https://github.com/allenai/allennlp/blob/master/some-fake-resource"

    # Create two cached versions of this fake resource under two etags.
    etags = [
        'W/"3e5885bfcbf4c47bc4ee9e2f6e5ea916"',
        'W/"3e5885bfcbf4c47bc4ee9e2f6e5ea918"',
    ]
    filenames = [
        os.path.join(self.TEST_DIR, url_to_filename(url, etag)) for etag in etags
    ]
    for filename, etag in zip(filenames, etags):
        meta_filename = filename + ".json"
        with open(filename, "w") as f:
            f.write("some random data")
        with open(meta_filename, "w") as meta_f:
            json.dump({"url": url, "etag": etag}, meta_f)
        # os.path.getmtime is only accurate to the second, so space the
        # writes out enough to get distinct modification times.
        time.sleep(1.1)

    # The version written last has the latest mtime, so it must win.
    assert get_from_cache(url, cache_dir=self.TEST_DIR) == filenames[-1]

    # It should also work when the latest cached version has no etag at all.
    filename = os.path.join(self.TEST_DIR, url_to_filename(url))
    meta_filename = filename + ".json"
    with open(filename, "w") as f:
        f.write("some random data")
    with open(meta_filename, "w") as meta_f:
        # Bug fix: the original wrote the stale `etag` loop variable here,
        # contradicting the "no corresponding etag" scenario under test.
        json.dump({"url": url, "etag": None}, meta_f)

    assert get_from_cache(url, cache_dir=self.TEST_DIR) == filename
def test_url_to_filename_with_etags_eliminates_quotes(self):
    """The surrounding double quotes HTTP servers put on etags are stripped
    before hashing, so the recovered etag comes back unquoted."""
    urls = (
        'http://allenai.org',
        'http://allennlp.org',
        'https://www.google.com',
        'http://pytorch.org',
    )
    for url in urls:
        hashed = url_to_filename(url, etag='"mytag"')
        assert "http" not in hashed
        pathlib.Path(os.path.join(self.TEST_DIR, hashed)).touch()
        recovered_url, recovered_etag = filename_to_url(hashed)
        assert recovered_url == url
        assert recovered_etag == "mytag"
def test_url_to_filename(self):
    """Hashing a url yields a scheme-free filename that decodes back to
    (url, None) when no etag was supplied."""
    urls = (
        'http://allenai.org',
        'http://allennlp.org',
        'https://www.google.com',
        'http://pytorch.org',
    )
    for url in urls:
        hashed = url_to_filename(url)
        assert "http" not in hashed
        pathlib.Path(os.path.join(self.TEST_DIR, hashed)).touch()
        recovered_url, recovered_etag = filename_to_url(hashed)
        assert recovered_url == url
        assert recovered_etag is None
def test_url_to_filename_with_etags(self):
    """Filenames hashed with an etag round-trip back to (url, etag), and the
    etag is hashed separately from the url (no cross-collisions)."""
    for url in [
        "http://allenai.org",
        "http://allennlp.org",
        "https://www.google.com",
        "http://pytorch.org",
    ]:
        filename = url_to_filename(url, etag="mytag")
        assert "http" not in filename
        pathlib.Path(os.path.join(self.TEST_DIR, filename)).touch()
        # Write the .json metadata with a context manager so the handle is
        # closed (the original passed an anonymous open() to json.dump and
        # leaked it).
        with open(os.path.join(self.TEST_DIR, filename + ".json"), "w") as meta_file:
            json.dump({"url": url, "etag": "mytag"}, meta_file)
        back_to_url, etag = filename_to_url(filename, cache_dir=self.TEST_DIR)
        assert back_to_url == url
        assert etag == "mytag"

    # url "…/" + "1" must not hash to the same name as url "…/" with etag "1".
    baseurl = "http://allenai.org/"
    assert url_to_filename(baseurl + "1") != url_to_filename(baseurl, etag="1")
def test_url_to_filename_with_etags_eliminates_quotes(self):
    """The surrounding double quotes HTTP servers put on etags are stripped
    before hashing, so the recovered etag comes back unquoted."""
    for url in [
        'http://allenai.org',
        'http://allennlp.org',
        'https://www.google.com',
        'http://pytorch.org',
    ]:
        filename = url_to_filename(url, etag='"mytag"')
        assert "http" not in filename
        pathlib.Path(os.path.join(self.TEST_DIR, filename)).touch()
        # Close the metadata file deterministically: the original passed an
        # anonymous open() to json.dump and leaked the handle.  The
        # redundant Python-2 u'' string prefixes are also dropped.
        with open(os.path.join(self.TEST_DIR, filename + '.json'), 'w') as meta_file:
            json.dump({'url': url, 'etag': 'mytag'}, meta_file)
        back_to_url, etag = filename_to_url(filename, cache_dir=self.TEST_DIR)
        assert back_to_url == url
        assert etag == "mytag"
def test_url_to_filename(self):
    """A hashed url filename contains no scheme and decodes to (url, None)."""
    urls = (
        'http://allenai.org',
        'http://allennlp.org',
        'https://www.google.com',
        'http://pytorch.org',
    )
    for url in urls:
        hashed = url_to_filename(url)
        assert "http" not in hashed
        pathlib.Path(os.path.join(self.TEST_DIR, hashed)).touch()
        recovered_url, recovered_etag = filename_to_url(hashed)
        assert recovered_url == url
        assert recovered_etag is None
def test_url_to_filename_with_etags_eliminates_quotes(self):
    """Quoted etags (as sent over HTTP) are stored unquoted: decoding the
    filename yields the etag without its surrounding double quotes."""
    urls = (
        'http://allenai.org',
        'http://allennlp.org',
        'https://www.google.com',
        'http://pytorch.org',
    )
    for url in urls:
        hashed = url_to_filename(url, etag='"mytag"')
        assert "http" not in hashed
        pathlib.Path(os.path.join(self.TEST_DIR, hashed)).touch()
        recovered_url, recovered_etag = filename_to_url(hashed)
        assert recovered_url == url
        assert recovered_etag == "mytag"
def test_url_to_filename(self):
    """Hashed filenames contain no scheme, require the .json metadata file
    to decode, and round-trip back to (url, None) when no etag was used."""
    for url in [
        'http://allenai.org',
        'http://allennlp.org',
        'https://www.google.com',
        'http://pytorch.org',
        # Very long urls must still hash to a usable filename.
        'https://s3-us-west-2.amazonaws.com/allennlp' + '/long' * 20 + '/url',
    ]:
        filename = url_to_filename(url)
        assert "http" not in filename
        # Nothing cached yet: decoding must fail.
        with pytest.raises(FileNotFoundError):
            filename_to_url(filename, cache_dir=self.TEST_DIR)
        pathlib.Path(os.path.join(self.TEST_DIR, filename)).touch()
        # The data file alone is not enough -- the .json metadata is required.
        with pytest.raises(FileNotFoundError):
            filename_to_url(filename, cache_dir=self.TEST_DIR)
        # Use a context manager so the metadata handle is closed (the
        # original passed an anonymous open() to json.dump and leaked it).
        with open(os.path.join(self.TEST_DIR, filename + '.json'), 'w') as meta_file:
            json.dump({'url': url, 'etag': None}, meta_file)
        back_to_url, etag = filename_to_url(filename, cache_dir=self.TEST_DIR)
        assert back_to_url == url
        assert etag is None
def test_cached_path(self):
    """`cached_path` returns existing local paths unchanged, rejects bad
    inputs, and downloads-and-caches remote urls."""
    url = 'http://fake.datastore.com/glove.txt.gz'
    set_up_glove(url, self.glove_bytes)

    # Missing local file -> FileNotFoundError.
    with pytest.raises(FileNotFoundError):
        cached_path("tests/fixtures/does_not_exist/fake_file.tar.gz")

    # Unknown URI scheme -> ValueError.
    with pytest.raises(ValueError):
        cached_path("fakescheme://path/to/fake/file.tar.gz")

    # Existing local file is passed straight through.
    assert cached_path(self.glove_file) == self.glove_file

    # A url is fetched (one HEAD + one GET) and cached under its hashed name.
    cached = cached_path(url, cache_dir=self.TEST_DIR)
    assert len(responses.calls) == 2
    expected = os.path.join(self.TEST_DIR, url_to_filename(url, etag="0"))
    assert cached == expected
    with open(cached, 'rb') as cached_file:
        assert cached_file.read() == self.glove_bytes