Example #1
    def test_cached_path_offline(self, monkeypatch):
        # Ensures `cached_path` just returns the path to the latest cached version
        # of the resource when there's no internet connection.

        # First we mock the `_http_etag` method so that it raises a `ConnectionError`,
        # like it would if there was no internet connection.
        def mocked_http_etag(url: str):
            raise ConnectionError

        monkeypatch.setattr(file_utils, "_http_etag", mocked_http_etag)

        url = "https://github.com/allenai/allennlp/blob/master/some-fake-resource"

        # We'll create two cached versions of this fake resource using two different etags.
        etags = [
            'W/"3e5885bfcbf4c47bc4ee9e2f6e5ea916"',
            'W/"3e5885bfcbf4c47bc4ee9e2f6e5ea918"'
        ]
        filenames = [
            os.path.join(self.TEST_DIR, _resource_to_filename(url, etag))
            for etag in etags
        ]
        for filename, etag in zip(filenames, etags):
            meta = _Meta(resource=url,
                         cached_path=filename,
                         creation_time=time.time(),
                         etag=etag,
                         size=2341)
            meta.to_file()
            with open(filename, "w") as f:
                f.write("some random data")
            # os.path.getmtime is only accurate to the second.
            time.sleep(1.1)

        # Should know to ignore lock files and extraction directories.
        with open(filenames[-1] + ".lock", "w") as f:
            f.write("")
        os.mkdir(filenames[-1] + "-extracted")

        # The version corresponding to the last etag should be returned, since
        # that one has the latest "last modified" time.
        assert get_from_cache(url, cache_dir=self.TEST_DIR) == filenames[-1]

        # We also want to make sure this works when the latest cached version doesn't
        # have a corresponding etag.
        filename = os.path.join(self.TEST_DIR, _resource_to_filename(url))
        meta = _Meta(resource=url,
                     cached_path=filename,
                     creation_time=time.time(),
                     size=2341)
        meta.to_file()
        with open(filename, "w") as f:
            f.write("some random data")

        assert get_from_cache(url, cache_dir=self.TEST_DIR) == filename
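
The test above leans on two helpers from allennlp's `file_utils` that aren't shown: `_resource_to_filename`, which maps a URL plus optional ETag to a cache filename, and the `_Meta` record written beside each cached file. A minimal sketch of both, assuming SHA-256-based filenames and sidecar JSON metadata (the `_sketch`/`Sketch` names are illustrative, not the library's own):

import json
from dataclasses import asdict, dataclass
from hashlib import sha256
from typing import Optional

def resource_to_filename_sketch(resource: str, etag: Optional[str] = None) -> str:
    # Hash the URL so the cache filename is filesystem-safe and fixed-length.
    filename = sha256(resource.encode("utf-8")).hexdigest()
    if etag:
        # Append a hash of the ETag so each remote version gets its own entry.
        filename += "." + sha256(etag.encode("utf-8")).hexdigest()
    return filename

@dataclass
class MetaSketch:
    resource: str
    cached_path: str
    creation_time: float
    size: int
    etag: Optional[str] = None

    def to_file(self) -> None:
        # Metadata lives beside the cached file with a ".json" suffix.
        with open(self.cached_path + ".json", "w") as f:
            json.dump(asdict(self), f)

    @classmethod
    def from_path(cls, path: str) -> "MetaSketch":
        with open(path) as f:
            return cls(**json.load(f))
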
Example #2
    def test_get_from_cache(self):
        url = "http://fake.datastore.com/glove.txt.gz"
        set_up_glove(url, self.glove_bytes, change_etag_every=2)

        filename = get_from_cache(url, cache_dir=self.TEST_DIR)
        assert filename == os.path.join(self.TEST_DIR,
                                        _resource_to_filename(url, etag="0"))
        assert os.path.exists(filename + ".json")
        meta = _Meta.from_path(filename + ".json")
        assert meta.resource == url

        # We should have made one HEAD request and one GET request.
        method_counts = Counter(call.request.method
                                for call in responses.calls)
        assert len(method_counts) == 2
        assert method_counts["HEAD"] == 1
        assert method_counts["GET"] == 1

        # And the cached file should have the correct contents
        with open(filename, "rb") as cached_file:
            assert cached_file.read() == self.glove_bytes

        # A second call to `get_from_cache` should make another HEAD call
        # but not another GET call.
        filename2 = get_from_cache(url, cache_dir=self.TEST_DIR)
        assert filename2 == filename

        method_counts = Counter(call.request.method
                                for call in responses.calls)
        assert len(method_counts) == 2
        assert method_counts["HEAD"] == 2
        assert method_counts["GET"] == 1

        with open(filename2, "rb") as cached_file:
            assert cached_file.read() == self.glove_bytes

        # A third call should have a different ETag and should force a new download,
        # which means another HEAD call and another GET call.
        filename3 = get_from_cache(url, cache_dir=self.TEST_DIR)
        assert filename3 == os.path.join(self.TEST_DIR,
                                         _resource_to_filename(url, etag="1"))

        method_counts = Counter(call.request.method
                                for call in responses.calls)
        assert len(method_counts) == 2
        assert method_counts["HEAD"] == 3
        assert method_counts["GET"] == 2

        with open(filename3, "rb") as cached_file:
            assert cached_file.read() == self.glove_bytes
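
This test and its older variant below (Example #3) both depend on a `set_up_glove` helper that isn't shown. Judging by the `responses.calls` inspection in the assertions, it is built on the `responses` mocking library; a sketch of one plausible implementation, where GET always serves the same fake payload and a HEAD callback rotates the ETag every `change_etag_every` requests to simulate a new upstream version:

import responses

def set_up_glove_sketch(url: str, byt: bytes, change_etag_every: int = 1000):
    # GET always returns the same fake gzipped payload.
    responses.add(responses.GET, url, body=byt, status=200,
                  content_type="application/gzip",
                  headers={"Content-Length": str(len(byt))})

    etags_left = change_etag_every
    etag = "0"

    def head_callback(_request):
        # Count down and bump the ETag so every `change_etag_every`-th
        # HEAD request looks like a new version of the file.
        nonlocal etags_left, etag
        headers = {"ETag": etag}
        etags_left -= 1
        if etags_left <= 0:
            etags_left = change_etag_every
            etag = str(int(etag) + 1)
        return (200, headers, "")

    responses.add_callback(responses.HEAD, url, callback=head_callback)
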
Example #3
    def test_get_from_cache(self):
        url = 'http://fake.datastore.com/glove.txt.gz'
        set_up_glove(url, self.glove_bytes, change_etag_every=2)

        filename = get_from_cache(url, cache_dir=self.TEST_DIR)
        assert filename == os.path.join(self.TEST_DIR, url_to_filename(url, etag="0"))

        # We should have made one HEAD request and one GET request.
        method_counts = Counter(call.request.method for call in responses.calls)
        assert len(method_counts) == 2
        assert method_counts['HEAD'] == 1
        assert method_counts['GET'] == 1

        # And the cached file should have the correct contents
        with open(filename, 'rb') as cached_file:
            assert cached_file.read() == self.glove_bytes

        # A second call to `get_from_cache` should make another HEAD call
        # but not another GET call.
        filename2 = get_from_cache(url, cache_dir=self.TEST_DIR)
        assert filename2 == filename

        method_counts = Counter(call.request.method for call in responses.calls)
        assert len(method_counts) == 2
        assert method_counts['HEAD'] == 2
        assert method_counts['GET'] == 1

        with open(filename2, 'rb') as cached_file:
            assert cached_file.read() == self.glove_bytes

        # A third call should have a different ETag and should force a new download,
        # which means another HEAD call and another GET call.
        filename3 = get_from_cache(url, cache_dir=self.TEST_DIR)
        assert filename3 == os.path.join(self.TEST_DIR, url_to_filename(url, etag="1"))

        method_counts = Counter(call.request.method for call in responses.calls)
        assert len(method_counts) == 2
        assert method_counts['HEAD'] == 3
        assert method_counts['GET'] == 2

        with open(filename3, 'rb') as cached_file:
            assert cached_file.read() == self.glove_bytes
Example #4
def claim_reader(db, in_file_path):
    """Stream claims with their evidence groups from a JSONL file, skipping unverifiable ones."""
    logger.info("Reading claims from {}".format(in_file_path))
    # Resolve the file through the cache (downloads on a cache miss).
    downloaded_file = file_utils.get_from_cache(in_file_path)

    with open(downloaded_file, "r") as in_file:
        for line in in_file:
            instance = json.loads(line)

            # Claims labeled "NOT ENOUGH INFO" have no evidence to attach, so skip them.
            if instance["label"] != "NOT ENOUGH INFO":
                evidence_groups = read_evidence(db, instance)
                instance["evidence_groups"] = evidence_groups
                yield instance
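
A minimal usage sketch for `claim_reader`, assuming a FEVER-style JSONL file of claim records and a `db` handle that the (not shown) `read_evidence` helper knows how to query; the URL and both names are placeholders:

from collections import Counter

label_counts = Counter()
for claim in claim_reader(db, "http://example.com/fever/train.jsonl"):
    label_counts[claim["label"]] += 1
print(label_counts)
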
Example #5
    def test_cached_path_offline(self, monkeypatch):
        # Ensures `cached_path` just returns the path to the latest cached version
        # of the resource when there's no internet connection.

        # First we mock the `_http_etag` method so that it raises a `ConnectionError`,
        # like it would if there was no internet connection.
        def mocked_http_etag(url: str):
            raise ConnectionError

        monkeypatch.setattr(file_utils, "_http_etag", mocked_http_etag)

        url = "https://github.com/allenai/allennlp/blob/master/some-fake-resource"

        # We'll create two cached versions of this fake resource using two different etags.
        etags = ['W/"3e5885bfcbf4c47bc4ee9e2f6e5ea916"', 'W/"3e5885bfcbf4c47bc4ee9e2f6e5ea918"']
        filenames = [os.path.join(self.TEST_DIR, url_to_filename(url, etag)) for etag in etags]
        for filename, etag in zip(filenames, etags):
            meta_filename = filename + ".json"
            with open(filename, "w") as f:
                f.write("some random data")
            with open(meta_filename, "w") as meta_f:
                json.dump({"url": url, "etag": etag}, meta_f)
            # os.path.getmtime is only accurate to the second.
            time.sleep(1.1)

        # The version corresponding to the last etag should be returned, since
        # that one has the latest "last modified" time.
        assert get_from_cache(url, cache_dir=self.TEST_DIR) == filenames[-1]

        # We also want to make sure this works when the latest cached version doesn't
        # have a corresponding etag.
        filename = os.path.join(self.TEST_DIR, url_to_filename(url))
        meta_filename = filename + ".json"
        with open(filename, "w") as f:
            f.write("some random data")
        with open(meta_filename, "w") as meta_f:
            # This version deliberately has no etag.
            json.dump({"url": url, "etag": None}, meta_f)

        assert get_from_cache(url, cache_dir=self.TEST_DIR) == filename
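
Both offline tests (Examples #1 and #5) exercise the same fallback: when the ETag request raises a ConnectionError, `get_from_cache` falls back to the most recently modified cached copy, skipping sidecar metadata, lock files, and extraction directories. A sketch of that lookup under the SHA-256 naming scheme assumed earlier (the function name is illustrative):

import glob
import os
from hashlib import sha256
from typing import Optional

def find_latest_cached_sketch(url: str, cache_dir: str) -> Optional[str]:
    # Every cached version of `url` shares the same URL-hash prefix;
    # versions differ only in the appended ETag hash.
    prefix = os.path.join(cache_dir, sha256(url.encode("utf-8")).hexdigest())
    candidates = []
    for path in glob.glob(prefix + "*"):
        # Skip metadata, lock files, and extraction directories.
        if path.endswith((".json", ".lock", "-extracted")):
            continue
        candidates.append((path, os.path.getmtime(path)))
    # Newest modification time wins, which is why the tests sleep for
    # over a second between writes (getmtime granularity).
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates[0][0] if candidates else None
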