def test_write_timeout_error(self):
    """Expect RemoteFileException when writing with ``timeout=0.0``."""
    # Both context managers are entered in the same order as the original
    # nested form: the raises-guard first, then the remote file.
    with pytest.raises(RemoteFileException), \
            RemoteFile(DefaultConfig(), "user/dataset", "file.txt",
                       timeout=0.0) as writer:
        writer.write("test")
def test_read_basic(self):
    """Read a mocked text download in 'r' mode and compare the contents."""
    download_url = '{}/file_download/{}/{}/{}'.format(
        'https://query.data.world', 'user', 'dataset', 'file.txt')

    def serve_text(request):
        # The client is expected to identify itself on every request.
        assert request.headers.get('User-Agent') == _user_agent()
        return 200, {}, "this is the test."

    with responses.RequestsMock() as mock:
        mock.add_callback(mock.GET, download_url, callback=serve_text)
        with RemoteFile(DefaultConfig(), "user/dataset", "file.txt",
                        mode="r") as reader:
            contents = reader.read()
            assert "this is the test." == contents
def test_read_binary_bytes_iter(self):
    """Iterate a mocked binary download in 'rb' mode.

    The test expects the four packed bytes to come back as four
    single-byte items.
    """
    download_url = '{}/file_download/{}/{}/{}'.format(
        'https://query.data.world', 'user', 'dataset', 'file.txt')

    def serve_bytes(request):
        assert request.headers.get('User-Agent') == _user_agent()
        payload = struct.pack('BBBB', 0, 1, 254, 255)
        return 200, {}, payload

    with responses.RequestsMock() as mock:
        mock.add_callback(mock.GET, download_url, callback=serve_bytes)
        with RemoteFile(DefaultConfig(), "user/dataset", "file.txt",
                        mode="rb") as reader:
            contents = list(reader)
            assert [b"\x00", b"\x01", b"\xfe", b"\xff"] == contents
def test_write_error(self):
    """Expect RestApiError when the mocked upload endpoint answers 400."""
    upload_url = '{}/uploads/{}/{}/files/{}'.format(
        'https://api.data.world/v0', 'user', 'dataset', 'file.txt')

    def reject_upload(request):
        assert request.headers.get('User-Agent') == _user_agent()
        return 400, {}, json.dumps({})

    # The raises-guard wraps the mock, exactly as in the nested form.
    with pytest.raises(RestApiError), responses.RequestsMock() as mock:
        mock.add_callback(mock.PUT, upload_url, callback=reject_upload)
        with RemoteFile(DefaultConfig(), "user/dataset",
                        "file.txt") as writer:
            writer.write("test")
def test_read_binary_iter_chunks(self):
    """Iterate a mocked download in 'rb' mode with ``chunk_size=4``.

    The six-character body is expected back as a 4-byte chunk followed
    by a 2-byte chunk.
    """
    download_url = '{}/file_download/{}/{}/{}'.format(
        'https://query.data.world', 'user', 'dataset', 'file.txt')

    def serve_letters(request):
        assert request.headers.get('User-Agent') == _user_agent()
        return 200, {}, "abcdef"

    with responses.RequestsMock() as mock:
        mock.add_callback(mock.GET, download_url, callback=serve_letters)
        with RemoteFile(DefaultConfig(), "user/dataset", "file.txt",
                        mode="rb", chunk_size=4) as reader:
            contents = list(reader)
            assert [b'abcd', b'ef'] == contents
def test_write_basic(self):
    """Write a short string and check the body the mocked endpoint sees."""
    upload_url = '{}/uploads/{}/{}/files/{}'.format(
        'https://api.data.world/v0', 'user', 'dataset', 'file.txt')

    def accept_upload(request):
        # The request body is iterated as byte chunks; stitch them back
        # together before comparing.
        uploaded = ''.join(
            [chunk.decode('utf-8') for chunk in request.body])
        assert "test" == uploaded
        assert request.headers.get('User-Agent') == _user_agent()
        return 200, {}, json.dumps({})

    with responses.RequestsMock() as mock:
        mock.add_callback(mock.PUT, upload_url, callback=accept_upload)
        with RemoteFile(DefaultConfig(), "user/dataset",
                        "file.txt") as writer:
            writer.write("test")
def test_read_jsonl(self):
    """Read a mocked JSON-lines download line by line in 'r' mode."""
    download_url = '{}/file_download/{}/{}/{}'.format(
        'https://query.data.world', 'user', 'dataset', 'file.csv')

    def serve_jsonl(request):
        assert request.headers.get('User-Agent') == _user_agent()
        body = ('{"A":"1", "B":"2", "C":"3"}\n'
                '{"A":"4", "B":"5", "C":"6"}\n')
        return 200, {}, body

    with responses.RequestsMock() as mock:
        mock.add_callback(mock.GET, download_url, callback=serve_jsonl)
        with RemoteFile(DefaultConfig(), "user/dataset", "file.csv",
                        mode="r") as reader:
            rows = []
            for line in reader:
                # Skip blank lines so json.loads never sees an empty string.
                if line.strip():
                    rows.append(json.loads(line))
            assert rows[0] == {'A': '1', 'B': '2', 'C': '3'}
            assert rows[1] == {'A': '4', 'B': '5', 'C': '6'}
def test_read_csv(self):
    """Feed an 'r' mode reader to csv.DictReader and check the parsed rows."""
    download_url = '{}/file_download/{}/{}/{}'.format(
        'https://query.data.world', 'user', 'dataset', 'file.csv')

    def serve_csv(request):
        assert request.headers.get('User-Agent') == _user_agent()
        return 200, {}, "A,B,C\n1,2,3\n4,5,6"

    with responses.RequestsMock() as mock:
        mock.add_callback(mock.GET, download_url, callback=serve_csv)
        with RemoteFile(DefaultConfig(), "user/dataset", "file.csv",
                        mode="r") as reader:
            rows = list(csv.DictReader(reader))
            assert rows[0] == {'A': '1', 'B': '2', 'C': '3'}
            assert rows[1] == {'A': '4', 'B': '5', 'C': '6'}
def test_write_csv(self):
    """Drive csv.DictWriter through a writer and check the uploaded body."""
    upload_url = '{}/uploads/{}/{}/files/{}'.format(
        'https://api.data.world/v0', 'user', 'dataset', 'file.csv')

    def accept_csv(request):
        uploaded = ''.join(
            [chunk.decode('utf-8') for chunk in request.body])
        assert "a,b\r\n42,17\r\n420,178\r\n" == uploaded
        assert request.headers.get('User-Agent') == _user_agent()
        return 200, {}, json.dumps({})

    with responses.RequestsMock() as mock:
        mock.add_callback(mock.PUT, upload_url, callback=accept_csv)
        with RemoteFile(DefaultConfig(), "user/dataset",
                        "file.csv") as writer:
            csvw = csv.DictWriter(writer, fieldnames=['a', 'b'])
            csvw.writeheader()
            # Rows are written one at a time, in this order.
            for record in ({'a': 42, 'b': 17}, {'a': 420, 'b': 178}):
                csvw.writerow(record)
def open_remote_file(self, dataset_key, file_name, mode='w', **kwargs):
    """Open a remote file object for a file in a data.world dataset.

    The returned object can be used to write to or read from the file,
    depending on ``mode``.

    :param dataset_key: Dataset identifier, in the form of owner/id
    :type dataset_key: str
    :param file_name: The name of the file to open
    :type file_name: str
    :param mode: the mode for the file - must be 'w', 'wb', 'r', or 'rb' -
        indicating read/write ('r'/'w') and optionally "binary"
        handling of the file data. (Default value = 'w')
    :type mode: str, optional
    :param chunk_size: size of chunked bytes to return when reading
        streamed bytes in 'rb' mode
    :type chunk_size: int, optional
    :param decode_unicode: whether to decode textual responses as unicode
        when returning streamed lines in 'r' mode
    :type decode_unicode: bool, optional
    :param **kwargs: additional keyword arguments, passed through
        unchanged to ``RemoteFile``
    :returns: the newly constructed ``RemoteFile``
    :raises RestApiError: if constructing the ``RemoteFile`` raises any
        exception; the original exception is passed as ``cause``

    Examples
    --------
    >>> import datadotworld as dw
    >>>
    >>> # write a text file
    >>> with dw.open_remote_file('username/test-dataset',
    ...                          'test.txt') as w:
    ...     w.write("this is a test.")
    >>>
    >>> # write a jsonlines file
    >>> import json
    >>> with dw.open_remote_file('username/test-dataset',
    ...                          'test.jsonl') as w:
    ...     json.dump({'foo':42, 'bar':"A"}, w)
    ...     w.write("\\n")
    ...     json.dump({'foo':13, 'bar':"B"}, w)
    ...     w.write("\\n")
    >>>
    >>> # write a csv file
    >>> import csv
    >>> with dw.open_remote_file('username/test-dataset',
    ...                          'test.csv') as w:
    ...     csvw = csv.DictWriter(w, fieldnames=['foo', 'bar'])
    ...     csvw.writeheader()
    ...     csvw.writerow({'foo':42, 'bar':"A"})
    ...     csvw.writerow({'foo':13, 'bar':"B"})
    >>>
    >>> # write a pandas dataframe as a csv file
    >>> import pandas as pd
    >>> df = pd.DataFrame({'foo':[1,2,3,4],'bar':['a','b','c','d']})
    >>> with dw.open_remote_file('username/test-dataset',
    ...                          'dataframe.csv') as w:
    ...     df.to_csv(w, index=False)
    >>>
    >>> # write a binary file
    >>> with dw.open_remote_file('username/test-dataset',
    ...                          'test.txt', mode='wb') as w:
    ...     w.write(bytes([100,97,116,97,46,119,111,114,108,100]))
    >>>
    >>> # read a text file
    >>> with dw.open_remote_file('username/test-dataset',
    ...                          'test.txt', mode='r') as r:
    ...     print(r.read())
    >>>
    >>> # read a csv file
    >>> with dw.open_remote_file('username/test-dataset',
    ...                          'test.csv', mode='r') as r:
    ...     csvr = csv.DictReader(r)
    ...     for row in csvr:
    ...         print(row['column a'], row['column b'])
    >>>
    >>> # read a binary file
    >>> with dw.open_remote_file('username/test-dataset',
    ...                          'test', mode='rb') as r:
    ...     bytes = r.read()
    """
    try:
        remote_file = RemoteFile(
            self._config, dataset_key, file_name, mode=mode, **kwargs)
    except Exception as e:
        # Any construction failure is reported through the API error type.
        raise RestApiError(cause=e)
    else:
        return remote_file