def _download_file(url, local_path, headers, compression):
    """Stream the file at ``url`` to ``local_path``, prefixed by ``headers``.

    Parameters
    ----------
    url : str
        Location fetched with a streaming ``requests.get``.
    local_path : str
        Destination path on the local filesystem.
    headers : bytes
        Bytes written ahead of the downloaded content.
    compression : str
        One of ``'gzip'``, ``'none'`` or ``'zip'``. Any other value makes
        the request but writes nothing to ``local_path``.

    Raises
    ------
    requests.HTTPError
        If ``raise_for_status`` reports a failed request.
    """
    response = requests.get(url, stream=True)
    # A streamed response holds its connection until it is closed, so
    # always release it, including when a write below fails.
    try:
        response.raise_for_status()

        # gzipped buffers can be concatenated so write headers as gzip
        if compression == 'gzip':
            with gzip.open(local_path, 'wb') as fout:
                fout.write(headers)
            with open(local_path, 'ab') as fout:
                shutil.copyfileobj(response.raw, fout, CHUNK_SIZE)

        # write headers and decompress the stream
        elif compression == 'none':
            with open(local_path, 'wb') as fout:
                fout.write(headers)
                _decompress_stream(response, fout)

        # decompress the stream, write headers, and zip the file
        elif compression == 'zip':
            with TemporaryDirectory() as tmp_dir:
                tmp_path = path.join(tmp_dir, 'civis_to_csv.csv')
                with open(tmp_path, 'wb') as tmp_file:
                    tmp_file.write(headers)
                    _decompress_stream(response, tmp_file)

                with zipfile.ZipFile(local_path, 'w') as fout:
                    arcname = path.basename(local_path)
                    # Swap only the trailing "zip" extension for "csv";
                    # splitting on the first dot would truncate names
                    # such as "export.2019.zip" to "export.csv".
                    stem, dot, extension = arcname.rpartition('.')
                    if extension == 'zip':
                        arcname = (stem if dot else extension) + '.csv'
                    fout.write(tmp_path, arcname, zipfile.ZIP_DEFLATED)
    finally:
        response.close()
def test_download_file(self, *mocks):
    """Downloading the export with compression 'none' yields the CSV row."""
    with TemporaryDirectory() as workdir:
        target = os.path.join(workdir, 'tempfile')
        civis.io._tables._download_file(
            self.export_url, target, b'', 'none')
        with open(target, "r") as handle:
            contents = handle.read()
    assert contents == '"1","2","3"\n'
def test_stash_local_data_from_file(mock_file):
    """A local CSV path is stashed under the name 'modelpipeline_data.csv'."""
    with TemporaryDirectory() as workdir:
        csv_path = os.path.join(workdir, 'tempfile')
        with open(csv_path, 'wt') as handle:
            handle.write("a,b,c\n1,2,3\n")
        stashed_id = _model._stash_local_file(csv_path)
        assert stashed_id == -11
    mock_file.assert_called_once_with(
        mock.ANY, name='modelpipeline_data.csv', client=mock.ANY)
def test_zip_member_to_civis(self, *mocks):
    """An open member of a zip archive can be passed to ``file_to_civis``."""
    member_name = 'archive_name'
    with TemporaryDirectory() as workdir:
        archive_path = os.path.join(workdir, 'tempfile')
        archive = zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED)
        with archive:
            archive.writestr(member_name, 'a,b,c\n1,2,3')
            first_member = archive.namelist()[0]
            with archive.open(first_member) as member_buf:
                uploaded = civis.io.file_to_civis(member_buf, first_member)
    assert isinstance(uploaded, int)
def test_large_file_to_civis(self, *mocks):
    """Upload a file with ``MIN_MULTIPART_SIZE`` lowered to one byte.

    The module-level threshold is restored in a ``finally`` clause so a
    failing upload cannot leak the patched value into other tests.
    Presumably the one-byte threshold forces the multipart code path --
    confirm against ``civis.io._files``.
    """
    curr_size = civis.io._files.MIN_MULTIPART_SIZE
    civis.io._files.MIN_MULTIPART_SIZE = 1
    try:
        with TemporaryDirectory() as temp_dir:
            fname = os.path.join(temp_dir, 'tempfile')
            with open(fname, 'w+b') as tmp:
                tmp.write(b'a,b,c\n1,2,3')
            with open(fname, 'r+b') as tmp:
                result = civis.io.file_to_civis(tmp, fname)
    finally:
        civis.io._files.MIN_MULTIPART_SIZE = curr_size
    assert isinstance(result, int)
def test_civis_to_file_local(mock_requests):
    """``civis_to_file`` downloads the file URL and writes it to disk.

    The URL comes from the API client's ``files.get`` response; the test
    checks that ``requests.get`` is called on it with ``stream=True`` and
    that the streamed chunks end up in the named local file.
    """
    file_id = 137
    fake_client = create_client_mock()
    chunks = (char.encode() for char in 'abcdef')
    mock_requests.get.return_value.iter_content.return_value = chunks

    with TemporaryDirectory() as workdir:
        out_path = os.path.join(workdir, 'testfile')
        _files.civis_to_file(file_id, out_path, client=fake_client)
        with open(out_path, 'rt') as handle:
            assert handle.read() == 'abcdef'

    fake_client.files.get.assert_called_once_with(file_id)
    expected_url = fake_client.files.get.return_value.file_url
    mock_requests.get.assert_called_once_with(expected_url, stream=True)
def test_cache_list(client_mock, civisio_mock, capsys):
    """Make sure the file ID cache listing works."""
    with TemporaryDirectory() as dirname:
        # One (file id, md5 of contents, path) triple per cached file.
        uploads = []
        for file_id, basename, text in ((-1, 'test1.txt', 'a'),
                                        (-2, 'test2.txt', 'b')):
            file_path = os.path.join(dirname, basename)
            with open(file_path, 'w') as fp:
                fp.write(text)
            digest = hashlib.md5(text.encode('utf-8')).hexdigest()
            uploads.append((file_id, digest, file_path))

        exp_time = str(datetime.datetime.utcnow() +
                       datetime.timedelta(days=15))
        civisio_mock.file_to_civis.side_effect = [-1, -2]
        client_mock().files.get.return_value = {'expires_at': exp_time}

        # Upload some files.
        fc = FileIDCache(cache_file=os.path.join(dirname, 'cache.db'))
        for _, _, file_path in uploads:
            fc.get_fileid(file_path)

        # List the cache and make sure that looks OK.
        fc.list()
        out, err = capsys.readouterr()
        print(out)
        lines = out.split('\n')
        expected_header = ("file id hash "
                           "expiration date name")
        assert lines[0].strip() == expected_header, (
            "File cache list header is wrong!")

        # Rows follow the header in upload order.
        for row, (file_id, digest, file_path) in enumerate(uploads, 1):
            items = lines[row].strip().split()
            assert items[0] == str(file_id), (
                "file id is wrong for cache listing!")
            assert items[1] == digest, "hash is wrong for cache listing!"
            assert ' '.join(items[2:4]) == exp_time, (
                "expiration time is wrong for cache listing!")
            assert items[4] == file_path, "name is wrong for cache listing!"
def test_file_to_civis(mock_requests, input_filename):
    """``file_to_civis`` creates a Civis File and posts to its upload URL.

    The API client must receive exactly one ``files.post`` call with the
    name 'newname', and ``requests.post`` exactly one call on the upload
    URL which that call returned.
    """
    expected_name = 'newname'
    expected_id = 137
    fake_client = create_client_mock()
    fake_client.files.post.return_value.id = expected_id

    with TemporaryDirectory() as workdir:
        local_path = os.path.join(workdir, 'newname')
        with open(local_path, 'wt') as handle:
            handle.write('abcdef')
        uploaded_id = _files.file_to_civis(
            local_path, input_filename, expires_at=None, client=fake_client)

    assert uploaded_id == expected_id
    fake_client.files.post.assert_called_once_with(
        expected_name, expires_at=None)
    upload_url = fake_client.files.post.return_value.upload_url
    mock_requests.post.assert_called_once_with(upload_url, files=mock.ANY)
def test_cache(client_mock, civisio_mock, day_offset):
    """Make sure the file ID cache works."""
    with TemporaryDirectory() as dirname:
        paths = []
        for basename, text in (('test1.txt', 'a'), ('test2.txt', 'b')):
            file_path = os.path.join(dirname, basename)
            with open(file_path, 'w') as fp:
                fp.write(text)
            paths.append(file_path)
        first_path, second_path = paths

        exp_time = str(datetime.datetime.utcnow() +
                       datetime.timedelta(days=day_offset))
        exp_time30 = str(datetime.datetime.utcnow() +
                         datetime.timedelta(days=30))
        civisio_mock.file_to_civis.side_effect = [-1, -2, -3]
        client_mock().files.get.return_value = {'expires_at': exp_time}

        # Upload once: each new file is one more upload and one lookup.
        fc = FileIDCache(cache_file=os.path.join(dirname, 'cache.db'))
        inserts = [(first_path, -1), (second_path, -2)]
        for n_uploads, (file_path, new_id) in enumerate(inserts, 1):
            fid = fc.get_fileid(file_path)
            assert fid == new_id, "File ID for cache insert is wrong!"
            client_mock().files.get.assert_called_with(new_id)
            assert civisio_mock.file_to_civis.call_count == n_uploads

        # Get it back out.
        client_mock.reset_mock()
        if day_offset < 14:
            client_mock().files.get.return_value = {'expires_at': exp_time30}
        fid = fc.get_fileid(first_path)

        # If it is going to expire, make sure we uploaded it again
        upload_count = civisio_mock.file_to_civis.call_count
        if day_offset >= 14:
            assert fid == -1, "File ID for cache get is wrong!"
            assert upload_count == 2
            assert client_mock().files.get.call_count == 0
        else:
            assert fid == -3, "File ID for cache get is wrong!"
            assert upload_count == 3
            client_mock().files.get.assert_called_with(-3)
def test_csv_to_civis(self, *mocks):
    """Importing a small CSV file into the fixture table succeeds."""
    target_table = "scratch.api_client_test_fixture"
    target_database = 'redshift-general'
    with TemporaryDirectory() as workdir:
        csv_path = os.path.join(workdir, 'tempfile')
        with open(csv_path, 'w+b') as handle:
            handle.write(b'a,b,c\n1,2,3')
        future = civis.io.csv_to_civis(
            csv_path, target_database, target_table,
            existing_table_rows='truncate',
            polling_interval=POLL_INTERVAL)
        outcome = future.result()  # block until done
    assert isinstance(outcome.id, int)
    assert outcome.state == 'succeeded'
def test_reformat_notebook(example_notebook):
    """Check the script text ``_reformat_notebook`` produces for a notebook.

    The ``example_notebook`` fixture text is written to ``script.ipynb``
    in a temporary directory and the converted code is compared with a
    fixed expected string.
    """
    with TemporaryDirectory() as tmpdir:
        with open(os.path.join(tmpdir, 'script.ipynb'), 'w') as fp:
            fp.write(example_notebook)
        code = _reformat_notebook(os.path.join(tmpdir, 'script.ipynb'))
        # NOTE(review): the expected literal below looks whitespace-collapsed
        # in this view (it was presumably a multi-line string) -- confirm the
        # exact newlines against the real file before relying on it.
        assert code == """ # coding: utf-8 # In[ ]: import sys pass pass """, "Notebook was not reformatted correctly!"
def dataframe_to_file(df, name='data.csv', expires_at='DEFAULT',
                      client=None, **to_csv_kws):
    """Store a :class:`~pandas.DataFrame` as a CSV in Civis Platform

    Parameters
    ----------
    df : :class:`~pandas.DataFrame`
        The table to upload.
    name : str, optional
        The name of the Civis File
    expires_at : str, optional
        The date and time the file will expire. If not specified, the file
        will expire in 30 days. To keep a file indefinitely, specify null.
        If provided, this must be either `None` or a valid RFC3339
        date/Time string.
    client : :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.
    **to_csv_kws
        Additional keyword parameters will be passed directly to
        :func:`~pandas.DataFrame.to_csv`.

    Returns
    -------
    file_id : int
        The integer ID of the new Civis File object

    See Also
    --------
    :func:`file_to_civis`
    :func:`~pandas.DataFrame.to_csv`
    """
    # Only the last path component of ``name`` is used for the scratch
    # file. Joining ``name`` directly would fail for names containing a
    # directory (the subdirectory does not exist) and would write outside
    # the temporary directory for absolute names. The extension is kept,
    # and the Civis File itself still receives the full ``name``.
    local_name = os.path.basename(name) or 'data.csv'
    with TemporaryDirectory() as tdir:
        path = os.path.join(tdir, local_name)
        df.to_csv(path, **to_csv_kws)
        file_kwargs = {'name': name}
        if expires_at != 'DEFAULT':
            # A missing parameter signifies the default value here.
            file_kwargs['expires_at'] = expires_at
        fid = file_to_civis(path, client=client, **file_kwargs)
    return fid
def setUpClass(cls, *mocks):
    """Build the fixtures shared by the tests of this class.

    While the ``io_setup.yml`` cassette is active this uploads a small
    CSV buffer, recreates ``scratch.api_client_test_fixture`` with a
    single row and exports it. The resulting ``file_id``, ``export_url``
    and ``export_job_id`` are stored on the class.
    """
    get_api_spec.cache_clear()
    generate_classes.cache_clear()

    recorder = vcr.VCR(filter_headers=['Authorization'])
    cassette_path = os.path.join(cassette_dir(), 'io_setup.yml')
    with recorder.use_cassette(cassette_path):
        # create a file
        csv_buf = StringIO()
        csv_buf.write('a,b,c\n1,2,3')
        csv_buf.seek(0)
        cls.file_id = civis.io.file_to_civis(csv_buf, 'somename')

        # create the table. assumes this function works.
        # NOTE(review): the statement text is reproduced exactly as it
        # appears in this view; confirm its original whitespace.
        setup_sql = (
            " DROP TABLE IF EXISTS scratch.api_client_test_fixture;"
            " CREATE TABLE scratch.api_client_test_fixture"
            " ( a int, b int, c int );"
            " INSERT INTO scratch.api_client_test_fixture"
            " VALUES (1,2,3); "
        )
        setup_future = civis.io.query_civis(
            setup_sql, 'redshift-general', polling_interval=POLL_INTERVAL)
        setup_future.result()  # block

        # create an export to check get_url. also tests export_csv
        with TemporaryDirectory() as workdir:
            export_path = os.path.join(workdir, 'tempfile')
            export_future = civis.io.civis_to_csv(
                export_path,
                "SELECT * FROM scratch.api_client_test_fixture",
                'redshift-general',
                polling_interval=POLL_INTERVAL)
            export = export_future.result()
            cls.export_url = export['output'][0]['path']
            assert export.state == 'succeeded'

        cls.export_job_id = export.sql_id
def _sub(script, cli_args, cli_params):
    """Driver for the CLI submit command.

    An existing file whose name ends in ``.ipynb`` is first converted
    with ``_reformat_notebook`` into a temporary copy, which is submitted
    in its place; anything else is submitted unchanged. A truthy job ID
    is printed and recorded as the most recent job.
    """
    client = civis.APIClient(resources='all')
    dry_run = cli_params.pop('dry_run')
    is_notebook = os.path.isfile(script) and script.endswith('.ipynb')

    if not is_notebook:
        jobid = _sub_job(script, script, cli_args, cli_params,
                         client, dry_run)
    else:
        # Make notebooks python scripts in temp dir
        with TemporaryDirectory() as tmpdir:
            converted = os.path.join(tmpdir, os.path.basename(script))
            with open(converted, 'w') as fp:
                fp.write(_reformat_notebook(script))
            jobid = _sub_job(converted, script, cli_args, cli_params,
                             client, dry_run)

    if not jobid:
        return

    print(jobid)
    # If we get here, stuff the script ID in the user's home
    # area for later.
    _record_last_jobid(jobid)
def apply_async(self, func, callback=None):
    """Schedule ``func`` to be run and return a handle on its result.

    ``func`` is pickled together with a backend reference, uploaded as a
    Civis File expiring in a week, and submitted through
    ``self.executor``. Submission is tried once and then retried up to
    ``self.max_submit_retries`` more times on ``CivisAPIError``, sleeping
    ``2 ** n`` seconds after the n-th (zero-based) failure.

    Parameters
    ----------
    func : callable
        The function to serialize and run.
    callback : callable, optional
        Passed through to the returned ``_CivisBackendResult``.

    Returns
    -------
    _CivisBackendResult
        Wrapper around the future of the submitted job.

    Raises
    ------
    JobSubmissionError
        If submission still fails once every retry has been used.
    """
    # Serialize func to a temporary file and upload it to a Civis File.
    # Make the temporary files expire in a week.
    expires_at = (datetime.now() + timedelta(days=7)).isoformat()
    with TemporaryDirectory() as tempdir:
        temppath = os.path.join(tempdir, "civis_joblib_backend_func")
        with open(temppath, "wb") as tmpfile:
            cloudpickle.dump(
                (func,
                 self if self.remote_backend == 'civis'
                 else self.remote_backend),
                tmpfile, pickle.HIGHEST_PROTOCOL)
        with open(temppath, "rb") as tmpfile:
            func_file_id = \
                _robust_file_to_civis(tmpfile,
                                      "civis_joblib_backend_func",
                                      n_retries=5,
                                      delay=0.5,
                                      expires_at=expires_at,
                                      client=self.client)
            log.debug("uploaded serialized function to File: %d",
                      func_file_id)

    # Use the Civis CLI client to download the job runner script into
    # the container, and then run it on the uploaded job.
    # Only download the runner script if it doesn't already
    # exist in the destination environment.
    runner_remote_path = "civis_joblib_worker"
    cmd = ("{setup_cmd} && "
           "if command -v {runner_remote_path} >/dev/null; "
           "then exec {runner_remote_path} {func_file_id}; "
           "else pip install civis=={civis_version} && "
           "exec {runner_remote_path} {func_file_id}; fi ".format(
               civis_version=civis.__version__,
               runner_remote_path=runner_remote_path,
               func_file_id=func_file_id,
               setup_cmd=self.setup_cmd))

    # Try to submit the command, with optional retrying for certain
    # error types.
    for n_retries in range(1 + self.max_submit_retries):
        try:
            if self.using_template:
                args = {'JOBLIB_FUNC_FILE_ID': func_file_id}
                future = self.executor.submit(**args)
                log.debug(
                    "Started custom script from template "
                    "%s with arguments %s",
                    self.executor.from_template_id, args)
            else:
                future = self.executor.submit(fn=cmd)
                log.debug(
                    "started container script with "
                    "command: %s", cmd)
            # Stop retrying if submission was successful.
            break
        except CivisAPIError as e:
            # If we've retried the maximum number of times already,
            # then raise an exception. ``n_retries`` counts the retries
            # used before this attempt, so the remainder is the plain
            # difference; subtracting one more made the final attempt
            # of the loop unreachable.
            retries_left = self.max_submit_retries - n_retries
            if retries_left < 1:
                raise JobSubmissionError(e)

            log.debug("Retrying submission. %d retries left",
                      retries_left)

            # Sleep with exponentially increasing intervals in case
            # the issue persists for a while.
            time.sleep(2**n_retries)

    if self.executor.max_n_retries:
        # Start the ContainerFuture polling.
        # This will use more API calls, but will
        # allow the ContainerFuture to launch
        # retries if necessary.
        # (This is only relevant if we're not using the
        # notifications endpoint.)
        future.done()

    result = _CivisBackendResult(future, callback)
    return result
def file_to_civis(buf, name=None, api_key=None, client=None, **kwargs):
    """Upload a file to Civis.

    Parameters
    ----------
    buf : file-like object or str
        The file or buffer to upload. A string is interpreted as the
        path of a local file, which is opened for you.
    name : str, optional
        Name for the new Civis File. When omitted it is taken from the
        basename of ``buf`` (for a path string) or from ``buf.name``
        (for a file-like object).
    api_key : DEPRECATED str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    client : :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.
    **kwargs : kwargs
        Extra keyword arguments will be passed to the file creation
        endpoint. See :func:`~civis.resources._resources.Files.post`.

    Returns
    -------
    file_id : int
        The new Civis file ID.

    Raises
    ------
    TypeError
        If ``name`` is not provided and cannot be inferred from ``buf``

    Examples
    --------
    >>> # Upload file at a given path on the local filesystem.
    >>> file_id = file_to_civis("my_data.csv", 'my_data')
    >>> # If not given, ``name`` will be the basename of the given file path.
    >>> file_id = file_to_civis("foo/bar/data.csv")  # ``name`` is 'data.csv'
    >>> # Upload file which expires in 30 days
    >>> with open("my_data.csv", "r") as f:
    ...     file_id = file_to_civis(f, 'my_data')
    >>> # Upload file which never expires
    >>> with open("my_data.csv", "r") as f:
    ...     file_id = file_to_civis(f, 'my_data', expires_at=None)

    Notes
    -----
    Binary files (e.g., a compressed archive) should be opened in
    ``'rb'`` (read binary) mode before being passed in
    (e.g., ``open('myfile.zip', 'rb')``).

    Warning: If the file-like object is seekable, the current
    position will be reset to 0.

    This facilitates retries and is used to chunk files for multipart
    uploads for improved performance.

    Small or non-seekable file-like objects will be uploaded with a
    single post.
    """
    is_path = isinstance(buf, six.string_types)

    if name is None:
        if is_path:
            name = os.path.basename(buf)
        elif hasattr(buf, 'name'):
            name = buf.name
        else:
            raise TypeError(
                "`buf` is a file-like object, but its name cannot be inferred."
                " Please provide `name` explicitly.")

    if is_path:
        with open(buf, 'rb') as local_file:
            return _file_to_civis(local_file, name, api_key=api_key,
                                  client=client, **kwargs)

    # _file_to_civis should only ever receive a file-like object that is
    # on disk, seekable and at position 0; such objects go straight through.
    on_disk_at_start = (
        isinstance(buf, (io.BufferedReader, io.TextIOWrapper)) and
        buf.tell() == 0)
    if on_disk_at_start:
        return _file_to_civis(buf, name, api_key=api_key, client=client,
                              **kwargs)

    # Anything else is spooled to a temporary file first; a zero-length
    # read reveals whether the buffer yields bytes or text.
    yields_bytes = isinstance(buf.read(0), six.binary_type)
    mode = 'wb' if yields_bytes else 'w'
    with TemporaryDirectory() as tmp_dir:
        spool_path = os.path.join(tmp_dir, 'file_to_civis.csv')
        with open(spool_path, mode) as spool:
            shutil.copyfileobj(buf, spool, CHUNK_SIZE)
        with open(spool_path, 'rb') as spooled:
            return _file_to_civis(spooled, name, api_key=api_key,
                                  client=client, **kwargs)
def cleandir():
    """Yield a fresh temporary directory set as the working directory.

    The previous working directory is restored once the generator is
    resumed or closed, including when an exception is thrown into it at
    the ``yield``. Without that guarantee the process would be left
    inside a directory that has just been removed.
    """
    cwd = os.getcwd()
    with TemporaryDirectory() as tmpdir:
        os.chdir(tmpdir)
        try:
            yield tmpdir
        finally:
            # Leave the directory before TemporaryDirectory deletes it.
            os.chdir(cwd)
def _multipart_upload(buf, name, file_size, client, **kwargs):
    """Upload ``buf`` to a new Civis File in several parts.

    The part size is ``sqrt(MIN_PART_SIZE) * sqrt(file_size)``, but never
    less than ``MIN_PART_SIZE``. Each part is first written to its own
    temporary file and then sent with ``requests.put`` from a pool of
    ``MAX_THREADS`` workers.

    Parameters
    ----------
    buf : file-like object
        Source of the data; it must support ``seek`` and ``read``.
    name : str
        Name passed to ``client.files.post_multipart``.
    file_size : int
        Number of bytes to upload from ``buf``.
    client : :class:`civis.APIClient`
        Client used to create and complete the multipart file.
    **kwargs
        Extra keyword arguments for ``client.files.post_multipart``.

    Returns
    -------
    int
        ID of the file returned by ``client.files.post_multipart``.

    Raises
    ------
    HTTPError
        From the part upload when a ``PUT`` response is not OK.
    """
    # scale the part size based on file size
    part_size = max(int(math.sqrt(MIN_PART_SIZE) * math.sqrt(file_size)),
                    MIN_PART_SIZE)
    num_parts = int(math.ceil((file_size) / float(part_size)))

    # The arguments follow the order of the placeholders: total bytes,
    # number of parts, then bytes per part.
    log.debug(
        'Uploading file with %s bytes using %s file parts with a part '
        'size of %s bytes', file_size, num_parts, part_size)
    file_response = client.files.post_multipart(name=name,
                                                num_parts=num_parts,
                                                **kwargs)

    # Platform will give us a URL for each file part
    urls = file_response.upload_urls
    assert num_parts == len(urls), \
        "There are {} file parts but only {} urls".format(num_parts,
                                                          len(urls))

    # generate for writing a specific number of bytes from the buffer
    def _gen_chunks(part_buf, max_bytes, chunk_size=32 * 1024):
        bytes_read = 0
        while True:
            length = min(chunk_size, max_bytes - bytes_read)
            if length <= 0:
                break
            bytes_read += length
            data = part_buf.read(length)
            yield data

    # upload function wrapped with a retry decorator
    @retry(RETRY_EXCEPTIONS)
    def _upload_part_base(item, part_path):
        part_num, part_url = item[0], item[1]
        log.debug('Uploading file part %s', part_num)
        file_out = part_path.format(part_num)
        with open(file_out, 'rb') as fout:
            part_response = requests.put(part_url, data=fout)

        if not part_response.ok:
            msg = _get_aws_error_message(part_response)
            raise HTTPError(msg, response=part_response)

        # The placeholder is required; a message with an argument but no
        # "%s" makes the logging module report a formatting error.
        log.debug('Completed upload of file part %s', part_num)

    # Create the pool before entering ``try`` so that a failure to create
    # it is not masked by an unbound ``pool`` in the ``finally`` clause.
    pool = Pool(MAX_THREADS)

    # upload each part
    try:
        with TemporaryDirectory() as tmp_dir:
            tmp_path = os.path.join(tmp_dir, 'file_to_civis_{}.csv')
            for i in range(0, num_parts):
                offset = part_size * i
                num_bytes = min(part_size, file_size - offset)
                buf.seek(offset)

                # write part to disk so that we can stream it
                file_in = tmp_path.format(i)
                with open(file_in, 'wb') as fin:
                    for x in _gen_chunks(buf, num_bytes):
                        fin.write(x)

            _upload_part = partial(_upload_part_base, part_path=tmp_path)
            pool.map(_upload_part, enumerate(urls))

    # complete the multipart upload; an abort will be triggered
    # if any part except the last failed to upload at least 5MB
    finally:
        pool.terminate()
        client.files.post_multipart_complete(file_response.id)

    return file_response.id
def dataframe_to_civis(df, database, table, api_key=None, client=None,
                       max_errors=None, existing_table_rows="fail",
                       diststyle=None, distkey=None,
                       sortkey1=None, sortkey2=None,
                       headers=None, credential_id=None,
                       polling_interval=None,
                       archive=False, hidden=True, **kwargs):
    """Upload a `pandas` `DataFrame` into a Civis table.

    The index of the `DataFrame` is not uploaded. To keep it, pass
    `df.reset_index()` rather than `df` as the first argument.

    Parameters
    ----------
    df : :class:`pandas:pandas.DataFrame`
        The `DataFrame` to upload to Civis.
    database : str or int
        Upload data into this database. Can be the database name or ID.
    table : str
        The schema and table you want to upload to. E.g.,
        ``'scratch.table'``.
    api_key : DEPRECATED str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    client : :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.
    max_errors : int, optional
        The maximum number of rows with errors to remove from the import
        before failing.
    existing_table_rows : str, optional
        The behaviour if a table with the requested name already exists.
        One of ``'fail'``, ``'truncate'``, ``'append'`` or ``'drop'``.
        Defaults to ``'fail'``.
    diststyle : str, optional
        The distribution style for the table.
        One of ``'even'``, ``'all'`` or ``'key'``.
    distkey : str, optional
        The column to use as the distkey for the table.
    sortkey1 : str, optional
        The column to use as the sortkey for the table.
    sortkey2 : str, optional
        The second column in a compound sortkey for the table.
    headers : bool, optional
        Whether or not the first row of the file should be treated as
        headers. The default, ``None``, attempts to autodetect whether
        or not the first row contains headers.
    credential_id : str or int, optional
        The ID of the database credential.  If ``None``, the default
        credential will be used.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for job completion.
    archive : bool, optional (deprecated)
        If ``True``, archive the import job as soon as it completes.
    hidden : bool, optional
        If ``True`` (the default), this job will not appear in the Civis UI.
    **kwargs : kwargs
        Extra keyword arguments will be passed to
        :meth:`pandas:pandas.DataFrame.to_csv`.

    Returns
    -------
    fut : :class:`~civis.futures.CivisFuture`
        A `CivisFuture` object.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    >>> fut = civis.io.dataframe_to_civis(df, 'my-database',
    ...                                   'scratch.df_table')
    >>> fut.result()
    """
    if client is None:
        client = APIClient(api_key=api_key, resources='all')
    if archive:
        warnings.warn("`archive` is deprecated and will be removed in "
                      "v2.0.0. Use `hidden` instead.", FutureWarning)

    # Dump the frame to a scratch CSV and upload it as a Civis File that
    # is named after the table part of ``table``.
    with TemporaryDirectory() as scratch_dir:
        csv_path = os.path.join(scratch_dir, 'dataframe_to_civis.csv')
        df.to_csv(csv_path, encoding='utf-8', index=False, **kwargs)
        file_name = table.rsplit('.', 1)[-1]
        file_id = file_to_civis(csv_path, file_name, client=client)

    import_options = {
        'client': client,
        'max_errors': max_errors,
        'existing_table_rows': existing_table_rows,
        'diststyle': diststyle,
        'distkey': distkey,
        'sortkey1': sortkey1,
        'sortkey2': sortkey2,
        'delimiter': ',',
        'headers': headers,
        'credential_id': credential_id,
        'polling_interval': polling_interval,
        'hidden': hidden,
    }
    return civis_file_to_table(file_id, database, table, **import_options)