def _import_bytes(buf, database, table, client, max_errors,
                  existing_table_rows, distkey, sortkey1, sortkey2,
                  delimiter, headers, credential_id, polling_interval,
                  archive, hidden):
    """Upload the contents of a file-like buffer into a Civis table.

    Creates a file import job, PUTs `buf` to the job's upload URI,
    starts the run, and returns a future for the run's completion.

    Parameters
    ----------
    buf : file-like
        Buffer whose bytes are uploaded as the CSV payload.
    database : str or int
        Upload data into this database (name or ID).
    table : str
        Schema-qualified destination, e.g. ``'scratch.table'``.
    client : :class:`civis.APIClient`
        Client used for all API calls.
    max_errors : int
        Maximum number of bad rows to tolerate before failing.
    existing_table_rows : str
        Behavior when the table exists (e.g. ``'fail'``, ``'truncate'``).
    distkey, sortkey1, sortkey2 : str
        Redshift distribution/sort column settings.
    delimiter : str
        Column delimiter; must be a key of ``DELIMITERS``.
    headers : bool
        Whether the first row of the file is a header row.
    credential_id : str or int
        Database credential ID; falls back to the client default.
    polling_interval : int or float
        Seconds between completion checks.
    archive : bool
        If True, archive the import job once the run finishes.
    hidden : bool
        If True, hide the job in the Civis UI.

    Returns
    -------
    :class:`~civis.futures.CivisFuture`

    Raises
    ------
    ValueError
        If `delimiter` is not a recognized delimiter.
    """
    schema, table = table.split(".", 1)
    db_id = client.get_database_id(database)
    cred_id = credential_id or client.default_credential
    delimiter = DELIMITERS.get(delimiter)
    # Raise instead of `assert` so the validation survives `python -O`,
    # matching the ValueError raised by the other io entry points.
    if not delimiter:
        raise ValueError("delimiter must be one of {}"
                         .format(DELIMITERS.keys()))
    kwargs = dict(schema=schema, name=table, remote_host_id=db_id,
                  credential_id=cred_id, max_errors=max_errors,
                  existing_table_rows=existing_table_rows, distkey=distkey,
                  sortkey1=sortkey1, sortkey2=sortkey2,
                  column_delimiter=delimiter, first_row_is_header=headers,
                  hidden=hidden)
    import_job = client.imports.post_files(**kwargs)
    put_response = requests.put(import_job.upload_uri, buf)
    put_response.raise_for_status()
    run_job_result = client._session.post(import_job.run_uri)
    run_job_result.raise_for_status()
    run_info = run_job_result.json()
    fut = CivisFuture(client.imports.get_files_runs,
                      (run_info['importId'], run_info['id']),
                      polling_interval=polling_interval, client=client,
                      poll_on_creation=False)
    if archive:
        def f(x):
            return client.imports.put_archive(import_job.id, True)
        fut.add_done_callback(f)
    return fut
def test_outputs_succeeded(self):
    """outputs() returns whatever the client reports for the run."""
    succeeded_poller = _create_poller_mock("succeeded")
    api_mock = create_client_mock()
    run_outputs = [{'test': 'test_result'}]
    api_mock.jobs.list_runs_outputs.return_value = run_outputs
    fut = CivisFuture(succeeded_poller, (1, 2), client=api_mock)
    assert fut.outputs() == run_outputs
def test_outputs_succeeded(self):
    """After a successful api result is set, outputs() yields run outputs."""
    api_mock = create_client_mock()
    run_outputs = [{'test': 'test_result'}]
    api_mock.jobs.list_runs_outputs.return_value = run_outputs
    succeeded = mock.Mock()
    succeeded.state = 'succeeded'
    fut = CivisFuture(mock.Mock(), (1, 2), client=api_mock)
    fut._set_api_result(succeeded)
    assert fut.outputs() == run_outputs
def test_check_message(self, *mocks):
    """A message whose object/run ids match the future is accepted."""
    fut = CivisFuture(lambda x: x, (1, 20))
    payload = {
        'object': {'id': 1},
        'run': {'id': 20, 'state': 'succeeded'},
    }
    self.assertTrue(fut._check_message(payload))
def test_check_message_with_different_run_id(self, *mocks):
    """A message for a different object id is rejected."""
    fut = CivisFuture(lambda x: x, (1, 20))
    payload = {
        'object': {'id': 2},
        'run': {'id': 20, 'state': 'succeeded'},
    }
    self.assertFalse(fut._check_message(payload))
def test_check_message_when_job_is_running(self, *mocks):
    """A message for a still-running job is not treated as terminal."""
    fut = CivisFuture(lambda x: x, (1, 20))
    payload = {
        'object': {'id': 1},
        'run': {'id': 20, 'state': 'running'},
    }
    self.assertFalse(fut._check_message(payload))
def run_template(id, arguments, JSONValue=False, client=None):
    """Run a template and return the results.

    Parameters
    ----------
    id: int
        The template id to be run.
    arguments: dict
        Dictionary of arguments to be passed to the template.
    JSONValue: bool, optional
        If True, will return the JSON output of the template.
        If False, will return the file ids associated with the
        output results.
    client: :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.

    Returns
    -------
    output: dict
        If JSONValue = False, dictionary of file ids with the keys
        being their output names.
        If JSONValue = True, JSON dict containing the results of the
        template run. Expects only a single JSON result. Will return
        nothing if either there is no JSON result or there is more
        than 1 JSON result.
    """
    if client is None:
        client = APIClient()
    job = client.scripts.post_custom(id, arguments=arguments)
    run = client.scripts.post_custom_runs(job.id)
    fut = CivisFuture(client.scripts.get_custom_runs, (job.id, run.id),
                      client=client)
    # Block until the custom-script run completes before fetching outputs.
    fut.result()
    outputs = client.scripts.list_custom_runs_outputs(job.id, run.id)
    if JSONValue:
        json_output = [
            o.value for o in outputs if o.object_type == "JSONValue"
        ]
        if len(json_output) == 0:
            log.warning("No JSON output for template {}".format(id))
            return
        if len(json_output) > 1:
            log.warning("More than 1 JSON output for template {}"
                        " -- returning only the first one.".format(id))
        # Note that the cast to a dict is to convert
        # an expected Response object.
        return dict(json_output[0])
    else:
        file_ids = {o.name: o.object_id for o in outputs}
        return file_ids
def test_set_api_result_result_succeeded(self, mock_subscribe, mock_api):
    """A 'succeeded' api result finishes the future with no extra polls."""
    pubnub = mock.Mock()
    pubnub.unsubscribe_all.return_value = None
    mock_subscribe.return_value = pubnub
    poll_fn = mock.Mock()
    succeeded = mock.Mock()
    succeeded.state = 'succeeded'
    fut = CivisFuture(poll_fn, (1, 2))
    fut._set_api_result(succeeded)
    assert poll_fn.call_count == 0
    assert pubnub.unsubscribe_all.call_count == 1
    assert fut._state == 'FINISHED'
def run_job(job_id, api_key=None, client=None, polling_interval=None):
    """Run a job.

    Parameters
    ----------
    job_id: str or int
        The ID of the job.
    api_key: DEPRECATED str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    client: :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.
    polling_interval : int or float, optional
        The number of seconds between API requests to check whether a result
        is ready.

    Returns
    -------
    results: :class:`~civis.futures.CivisFuture`
        A `CivisFuture` object.
    """
    if client is None:
        client = APIClient(api_key=api_key)
    run = client.jobs.post_runs(job_id)
    # poll_on_creation=False: the run was just started, so skip the
    # immediate status check and wait one polling interval first.
    return CivisFuture(
        client.jobs.get_runs,
        (job_id, run["id"]),
        client=client,
        polling_interval=polling_interval,
        poll_on_creation=False,
    )
def query_civis(sql, database, api_key=None, client=None, credential_id=None,
                preview_rows=10, polling_interval=None, hidden=True):
    """Execute a SQL statement as a Civis query.

    Run a query that may return no results or where only a small
    preview is required. To execute a query that returns a large number
    of rows, see :func:`~civis.io.read_civis_sql`.

    Parameters
    ----------
    sql : str
        The SQL statement to execute.
    database : str or int
        The name or ID of the database.
    api_key : DEPRECATED str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    client : :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.
    credential_id : str or int, optional
        The ID of the database credential. If ``None``, the default
        credential will be used.
    preview_rows : int, optional
        The maximum number of rows to return. No more than 100 rows can be
        returned at once.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for query completion.
    hidden : bool, optional
        If ``True`` (the default), this job will not appear in the Civis UI.

    Returns
    -------
    results : :class:`~civis.futures.CivisFuture`
        A `CivisFuture` object.

    Examples
    --------
    >>> run = query_civis(sql="DELETE schema.table", database='database')
    >>> run.result()  # Wait for query to complete
    """
    if client is None:
        client = APIClient(api_key=api_key)
    database_id = client.get_database_id(database)
    cred_id = credential_id or client.default_credential
    resp = client.queries.post(database_id, sql, preview_rows,
                               credential=cred_id, hidden=hidden)
    # polling_interval is passed positionally to CivisFuture here.
    return CivisFuture(client.queries.get, (resp.id, ), polling_interval,
                       client=client, poll_on_creation=False)
def run_job(job_id, api_key=None, client=None):
    """Run a job.

    Parameters
    ----------
    job_id : str or int
        The ID of the job.
    api_key : DEPRECATED str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    client : :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.

    Returns
    -------
    results : :class:`~civis.futures.CivisFuture`
        A `CivisFuture` object.
    """
    if client is None:
        client = APIClient(api_key=api_key)
    run = client.jobs.post_runs(job_id)
    # The run was just created, so do not poll immediately on creation.
    return CivisFuture(client.jobs.get_runs, (job_id, run['id']),
                       client=client, poll_on_creation=False)
def test_future_job_id_run_id(poller_args, expected_job_id, expected_run_id):
    """job_id and run_id are derived from the poller args tuple."""
    fut = CivisFuture(
        poller=_create_poller_mock("succeeded"),
        poller_args=poller_args,
        client=create_client_mock(),
    )
    assert (fut.job_id, fut.run_id) == (expected_job_id, expected_run_id)
def test_future_job_id_run_id(poller_args, expected_job_id, expected_run_id):
    """job_id and run_id reflect the poller args passed at construction."""
    fut = CivisFuture(
        poller=lambda x: x,
        poller_args=poller_args,
        client=create_client_mock(),
    )
    assert (fut.job_id, fut.run_id) == (expected_job_id, expected_run_id)
def test_polling_interval(self, *mocks):
    """An explicitly requested polling interval is honored."""
    clear_lru_cache()
    requested = 30
    fut = CivisFuture(lambda x: x, (1, 20), polling_interval=requested)
    assert fut.polling_interval == requested
    clear_lru_cache()
def test_polling_interval(self, *mocks):
    """Polling fallback: when channels are unavailable the requested
    interval is used and no pubnub subscription is created.

    This uses a different api spec than the other tests, so the cached
    values are cleared before and after.
    """
    clear_lru_cache()
    requested = 30
    fut = CivisFuture(lambda x: x, (1, 20), polling_interval=requested)
    assert fut.polling_interval == requested
    assert not hasattr(fut, '_pubnub')
    clear_lru_cache()
def test_set_api_result_failed(self, mock_subscribe, mock_api):
    """A 'failed' api result finishes the future; accessors raise."""
    pubnub = mock.Mock()
    pubnub.unsubscribe_all.return_value = None
    mock_subscribe.return_value = pubnub
    failed = mock.Mock()
    failed.state = 'failed'
    fut = CivisFuture(mock.Mock(), (1, 2))
    fut._set_api_result(failed)
    assert pubnub.unsubscribe_all.call_count == 1
    assert fut._state == 'FINISHED'
    for accessor in (fut.result, fut.outputs):
        with pytest.raises(CivisJobFailure):
            accessor()
def test_set_api_result_failed(self, mock_api, m_sleep):
    """A poller reporting 'failed' leaves a finished, raising future."""
    fut = CivisFuture(_create_poller_mock("failed"), (1, 2))
    assert fut._state == 'FINISHED'
    for accessor in (fut.result, fut.outputs):
        with pytest.raises(CivisJobFailure):
            accessor()
def run_job(job_id, api_key=None):
    """Run a job.

    Parameters
    ----------
    job_id : str or int
        The ID of the job.
    api_key : str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.

    Returns
    -------
    results : :class:`~civis.futures.CivisFuture`
        A `CivisFuture` object.
    """
    # A fresh client (with the full resource set) is created per call.
    client = APIClient(api_key=api_key, resources='all')
    run = client.jobs.post_runs(job_id)
    return CivisFuture(client.jobs.get_runs, (job_id, run['id']),
                       api_key=api_key, poll_on_creation=False)
def civis_to_csv(filename, sql, database, job_name=None, api_key=None,
                 credential_id=None, archive=False, hidden=True,
                 polling_interval=None):
    """Export data from Civis to a local CSV file.

    Parameters
    ----------
    filename : str
        Download exported data into this file.
    sql : str, optional
        The SQL select string to be executed.
    database : str or int
        Export data from this database. Can be the database name or ID.
    job_name : str, optional
        A name to give the job. If omitted, a random job name will be used.
    api_key : str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    credential_id : str or int, optional
        The ID of the database credential. If ``None``, the default
        credential will be used.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for query completion.
    archive : bool, optional (deprecated)
        If ``True``, archive the import job as soon as it completes.
    hidden : bool, optional
        If ``True`` (the default), this job will not appear in the Civis UI.

    Returns
    -------
    results : :class:`~civis.futures.CivisFuture`
        A `CivisFuture` object.

    Examples
    --------
    >>> sql = "SELECT * FROM schema.table"
    >>> fut = civis_to_csv("file.csv", sql, "my_database")
    >>> fut.result()  # Wait for job to complete

    See Also
    --------
    civis.io.read_civis : Read table contents into memory.
    civis.io.read_civis_sql : Read results of a SQL query into memory.
    """
    if archive:
        warnings.warn("`archive` is deprecated and will be removed in v2.0.0. "
                      "Use `hidden` instead.", FutureWarning)
    client = APIClient(api_key=api_key)
    script_id, run_id = _sql_script(client, sql, database, job_name,
                                    credential_id, hidden=hidden)
    fut = CivisFuture(client.scripts.get_sql_runs, (script_id, run_id),
                      polling_interval=polling_interval, api_key=api_key,
                      poll_on_creation=False)
    # Download the exported CSV once the SQL run completes.
    download = _download_callback(script_id, run_id, client, filename)
    fut.add_done_callback(download)
    if archive:
        def f(x):
            return client.scripts.put_sql_archive(script_id, True)
        fut.add_done_callback(f)
    return fut
def civis_to_multifile_csv(sql, database, job_name=None, api_key=None,
                           client=None, credential_id=None,
                           include_header=True, compression='none',
                           delimiter='|', unquoted=False, prefix=None,
                           polling_interval=None, hidden=True):
    """Unload the result of SQL query and return presigned urls.

    This function is intended for unloading large queries/tables from redshift
    as it uses a 'PARALLEL ON' S3 unload. It returns a similar manifest file
    to conventional S3 UNLOAD statements except the CSV parts are accessible
    via both files endpoint IDs and presigned S3 urls.

    Parameters
    ----------
    sql : str, optional
        The SQL select string to be executed.
    database : str or int
        Execute the query against this database. Can be the database name
        or ID.
    job_name : str, optional
        A name to give the job. If omitted, a random job name will be used.
    api_key : DEPRECATED str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    client : :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.
    credential_id : str or int, optional
        The database credential ID. If ``None``, the default credential
        will be used.
    include_header: bool, optional
        If ``True`` include a key in the returned dictionary containing a list
        of column names. Default: ``True``.
    compression: str, optional
        Type of compression to use, if any. One of ``'none'``, ``'zip'``, or
        ``'gzip'``. Default ``'none'``.
    delimiter: str, optional
        Which delimiter to use, if any. One of ``','``, ``'\t'``, or
        ``'|'``. Default: ``'|'``.
    unquoted: bool, optional
        Whether or not to quote fields. Default: ``False``.
    prefix: str, optional
        A user specified filename prefix for the output file to have.
        Default: ``None``.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for query completion.
    hidden : bool, optional
        If ``True`` (the default), this job will not appear in the Civis UI.

    Returns
    -------
    unload_manifest: dict
        A dictionary resembling an AWS manifest file. Has the following keys:
        ``'header'``, ``'query'``, ``'entries'``, representing the columns
        from the query, the query itself, and a list of dictionaries for each
        unloaded CSV part, each containing its file ``'id'``, ``'name'``,
        ``'size'``, and unsigned and signed S3 urls, ``'url'`` and
        ``'url_signed'``, respectively.

    Examples
    --------
    >>> sql = "SELECT * FROM schema.my_big_table"
    >>> database = "my_database"
    >>> delimiter = "|"
    >>> manifest = civis_to_multifile_csv(sql, database, delimiter=delimiter)
    >>> ids = [file['id'] for file in manifest['files']]
    >>> buf = BytesIO()
    >>> civis_to_file(ids[0], buf)
    >>> buf.seek(0)
    >>> df = pd.read_csv(buf, delimiter=delimiter)

    Raises
    ------
    ValueError
        If `delimiter` is not a recognized delimiter.
    EmptyResultError
        If the unload query produced no manifest output.

    See Also
    --------
    civis.APIClient.scripts.post_sql
    """
    if client is None:
        client = APIClient(api_key=api_key, resources='all')
    delimiter = DELIMITERS.get(delimiter)
    # Raise instead of `assert` so the validation survives `python -O`,
    # matching the ValueError raised by civis_to_csv.
    if not delimiter:
        raise ValueError("delimiter must be one of {}"
                         .format(DELIMITERS.keys()))
    csv_settings = dict(include_header=include_header,
                        compression=compression,
                        column_delimiter=delimiter,
                        unquoted=unquoted,
                        filename_prefix=prefix,
                        force_multifile=True)
    script_id, run_id = _sql_script(client, sql, database, job_name,
                                    credential_id, hidden,
                                    csv_settings=csv_settings)
    fut = CivisFuture(client.scripts.get_sql_runs, (script_id, run_id),
                      polling_interval=polling_interval, client=client,
                      poll_on_creation=False)
    outputs = fut.result()["output"]
    if not outputs:
        raise EmptyResultError("Unload query {} returned no manifest."
                               .format(script_id))
    # The manifest itself is delivered as a Civis file; download and parse it.
    buf = io.BytesIO()
    civis_to_file(outputs[0]['file_id'], buf)
    txt = io.TextIOWrapper(buf, encoding='utf-8')
    txt.seek(0)
    unload_manifest = json.load(txt)
    return unload_manifest
def civis_to_csv(filename, sql, database, job_name=None, api_key=None,
                 client=None, credential_id=None, include_header=True,
                 compression='none', delimiter=',', unquoted=False,
                 archive=False, hidden=True, polling_interval=None):
    """Export data from Civis to a local CSV file.

    Parameters
    ----------
    filename : str
        Download exported data into this file.
    sql : str, optional
        The SQL select string to be executed.
    database : str or int
        Export data from this database. Can be the database name or ID.
    job_name : str, optional
        A name to give the job. If omitted, a random job name will be used.
    api_key : DEPRECATED str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    client : :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.
    credential_id : str or int, optional
        The ID of the database credential. If ``None``, the default
        credential will be used.
    include_header: bool, optional
        If ``True``, the first line of the CSV will be headers.
        Default: ``True``.
    compression: str, optional
        Type of compression to use, if any. One of ``'none'``, ``'zip'``, or
        ``'gzip'``. Default ``'none'``.
    delimiter: str, optional
        Which delimiter to use, if any. One of ``','``, ``'\t'``, or
        ``'|'``. Default: ``','``.
    unquoted: bool, optional
        Whether or not to quote fields. Default: ``False``.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for query completion.
    archive : bool, optional (deprecated)
        If ``True``, archive the import job as soon as it completes.
    hidden : bool, optional
        If ``True`` (the default), this job will not appear in the Civis UI.

    Returns
    -------
    results : :class:`~civis.futures.CivisFuture`
        A `CivisFuture` object.

    Examples
    --------
    >>> sql = "SELECT * FROM schema.table"
    >>> fut = civis_to_csv("file.csv", sql, "my_database")
    >>> fut.result()  # Wait for job to complete

    See Also
    --------
    civis.io.read_civis : Read table contents into memory.
    civis.io.read_civis_sql : Read results of a SQL query into memory.
    """
    if archive:
        warnings.warn("`archive` is deprecated and will be removed in v2.0.0. "
                      "Use `hidden` instead.", FutureWarning)
    if client is None:
        client = APIClient(api_key=api_key, resources='all')
    delimiter = DELIMITERS.get(delimiter)
    if not delimiter:
        raise ValueError("delimiter must be one of {}"
                         .format(DELIMITERS.keys()))
    csv_settings = dict(include_header=include_header,
                        compression=compression,
                        column_delimiter=delimiter,
                        unquoted=unquoted,
                        filename_prefix=None,
                        force_multifile=False)
    script_id, run_id = _sql_script(client, sql, database, job_name,
                                    credential_id, hidden=hidden,
                                    csv_settings=csv_settings)
    fut = CivisFuture(client.scripts.get_sql_runs, (script_id, run_id),
                      polling_interval=polling_interval, client=client,
                      poll_on_creation=False)
    # Download the exported CSV once the SQL run completes.
    download = _download_callback(script_id, run_id, client, filename)
    fut.add_done_callback(download)
    if archive:
        def f(x):
            return client.scripts.put_sql_archive(script_id, True)
        fut.add_done_callback(f)
    return fut
def read_civis_sql(sql, database, use_pandas=False, job_name=None,
                   api_key=None, client=None, credential_id=None,
                   polling_interval=None, archive=False,
                   hidden=True, **kwargs):
    """Read data from Civis using a custom SQL string.

    Parameters
    ----------
    sql : str, optional
        The SQL select string to be executed.
    database : str or int
        Execute the query against this database. Can be the database name
        or ID.
    use_pandas : bool, optional
        If ``True``, return a :class:`pandas:pandas.DataFrame`. Otherwise,
        return a list of results from :func:`python:csv.reader`.
    job_name : str, optional
        A name to give the job. If omitted, a random job name will be used.
    api_key : DEPRECATED str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    client : :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.
    credential_id : str or int, optional
        The database credential ID.  If ``None``, the default credential
        will be used.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for query completion.
    archive : bool, optional (deprecated)
        If ``True``, archive the import job as soon as it completes.
    hidden : bool, optional
        If ``True`` (the default), this job will not appear in the Civis UI.
    **kwargs : kwargs
        Extra keyword arguments are passed into
        :func:`pandas:pandas.read_csv` if `use_pandas` is ``True`` or
        passed into :func:`python:csv.reader` if `use_pandas` is
        ``False``.

    Returns
    -------
    data : :class:`pandas:pandas.DataFrame` or list
        A list of rows (with header as first row) if `use_pandas` is
        ``False``, otherwise a `pandas` `DataFrame`. Note that if
        `use_pandas` is ``False``, no parsing of types is performed and
        each row will be a list of strings.

    Raises
    ------
    ImportError
        If `use_pandas` is ``True`` and `pandas` is not installed.

    Examples
    --------
    >>> sql = "SELECT * FROM schema.table"
    >>> df = read_civis_sql(sql, "my_database", use_pandas=True)
    >>> col_a = df["column_a"]

    >>> data = read_civis_sql(sql, "my_database")
    >>> columns = data.pop(0)
    >>> col_a_index = columns.index("column_a")
    >>> col_a = [row[col_a_index] for row in data]

    Notes
    -----
    This reads the data into memory.

    See Also
    --------
    civis.io.read_civis : Read directly into memory without SQL.
    civis.io.civis_to_csv : Write directly to a CSV file.
    """
    if client is None:
        client = APIClient(api_key=api_key, resources='all')
    if use_pandas and NO_PANDAS:
        raise ImportError("use_pandas is True but pandas is not installed.")
    if archive:
        warnings.warn("`archive` is deprecated and will be removed in v2.0.0. "
                      "Use `hidden` instead.", FutureWarning)
    script_id, run_id = _sql_script(client, sql, database,
                                    job_name, credential_id, hidden=hidden)
    fut = CivisFuture(client.scripts.get_sql_runs, (script_id, run_id),
                      polling_interval=polling_interval, client=client,
                      poll_on_creation=False)
    if archive:
        def f(x):
            return client.scripts.put_sql_archive(script_id, True)
        fut.add_done_callback(f)
    # Block until the SQL run finishes, then fetch its output location.
    fut.result()
    outputs = client.scripts.get_sql_runs(script_id, run_id)["output"]
    if not outputs:
        raise EmptyResultError("Query {} returned no output."
                               .format(script_id))
    url = outputs[0]["path"]
    if use_pandas:
        data = pd.read_csv(url, **kwargs)
    else:
        r = requests.get(url)
        r.raise_for_status()
        data = list(csv.reader(StringIO(r.text), **kwargs))
    return data
def test_poller_call_count_poll_on_creation_false(self, mock_api):
    """With poll_on_creation=False, construction does not poll."""
    mock_poller = _create_poller_mock("succeeded")
    CivisFuture(mock_poller, (1, 2), poll_on_creation=False)
    assert mock_poller.call_count == 0
def test_overwrite_polling_interval_with_channels(self, *mocks):
    """With channels available, the long-poll default interval is used."""
    fut = CivisFuture(lambda x: x, (1, 20))
    assert fut.polling_interval == _LONG_POLLING_INTERVAL
    assert hasattr(fut, '_pubnub')
def test_explicit_polling_interval_with_channels(self, *mocks):
    """An explicit polling interval wins even when channels are available."""
    fut = CivisFuture(lambda x: x, (1, 20), polling_interval=5)
    assert fut.polling_interval == 5
    assert hasattr(fut, '_pubnub')
def read_civis_sql(sql, database, use_pandas=False, job_name=None,
                   api_key=None, client=None, credential_id=None,
                   polling_interval=None, archive=False,
                   hidden=True, **kwargs):
    """Read data from Civis using a custom SQL string.

    The custom SQL string will be executed twice; once to attempt to
    retrieve headers and once to retrieve the data. This is done to
    use a more performant method for retrieving the data. The first
    execution of the custom SQL is controlled such that changes in
    state cannot occur (e.g., INSERT, UPDATE, DELETE, etc.).

    Parameters
    ----------
    sql : str, optional
        The SQL select string to be executed.
    database : str or int
        Execute the query against this database. Can be the database name
        or ID.
    use_pandas : bool, optional
        If ``True``, return a :class:`pandas:pandas.DataFrame`. Otherwise,
        return a list of results from :func:`python:csv.reader`.
    job_name : str, optional
        A name to give the job. If omitted, a random job name will be used.
    api_key : DEPRECATED str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    client : :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.
    credential_id : str or int, optional
        The database credential ID.  If ``None``, the default credential
        will be used.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for query completion.
    archive : bool, optional (deprecated)
        If ``True``, archive the import job as soon as it completes.
    hidden : bool, optional
        If ``True`` (the default), this job will not appear in the Civis UI.
    **kwargs : kwargs
        Extra keyword arguments are passed into
        :func:`pandas:pandas.read_csv` if `use_pandas` is ``True`` or
        passed into :func:`python:csv.reader` if `use_pandas` is
        ``False``.

    Returns
    -------
    data : :class:`pandas:pandas.DataFrame` or list
        A list of rows (with header as first row) if `use_pandas` is
        ``False``, otherwise a `pandas` `DataFrame`. Note that if
        `use_pandas` is ``False``, no parsing of types is performed and
        each row will be a list of strings.

    Raises
    ------
    ImportError
        If `use_pandas` is ``True`` and `pandas` is not installed.

    Examples
    --------
    >>> sql = "SELECT * FROM schema.table"
    >>> df = read_civis_sql(sql, "my_database", use_pandas=True)
    >>> col_a = df["column_a"]

    >>> data = read_civis_sql(sql, "my_database")
    >>> columns = data.pop(0)
    >>> col_a_index = columns.index("column_a")
    >>> col_a = [row[col_a_index] for row in data]

    Notes
    -----
    This reads the data into memory.

    See Also
    --------
    civis.io.read_civis : Read directly into memory without SQL.
    civis.io.civis_to_csv : Write directly to a CSV file.
    """
    if client is None:
        client = APIClient(api_key=api_key, resources='all')
    if use_pandas and NO_PANDAS:
        raise ImportError("use_pandas is True but pandas is not installed.")
    if archive:
        warnings.warn("`archive` is deprecated and will be removed in v2.0.0. "
                      "Use `hidden` instead.", FutureWarning)
    db_id = client.get_database_id(database)
    credential_id = credential_id or client.default_credential

    # determine if we can request headers separately; if we can then Platform
    # will perform a parallel unload which is significantly more performant
    # we start by assuming headers are requested
    ovrd_include_header, headers = _include_header(client, sql, True,
                                                   db_id, credential_id,
                                                   polling_interval)

    # if we retrieved headers then we are performing a parallel unload
    # in which case we need to specify backslash as the escapechar
    if headers is not None:
        kwargs['escapechar'] = '\\'

    csv_settings = dict(include_header=ovrd_include_header,
                        compression='gzip')

    script_id, run_id = _sql_script(client, sql, db_id,
                                    job_name, credential_id,
                                    csv_settings=csv_settings,
                                    hidden=hidden)
    fut = CivisFuture(client.scripts.get_sql_runs, (script_id, run_id),
                      polling_interval=polling_interval, client=client,
                      poll_on_creation=False)
    if archive:
        def f(x):
            return client.scripts.put_sql_archive(script_id, True)
        fut.add_done_callback(f)
    # Block until the SQL run finishes, then fetch the exported file.
    fut.result()
    outputs = client.scripts.get_sql_runs(script_id, run_id)["output"]
    if not outputs:
        raise EmptyResultError("Query {} returned no output."
                               .format(script_id))

    url = outputs[0]["path"]
    file_id = outputs[0]["file_id"]
    log.debug('Exported results to Civis file %s (%s)',
              outputs[0]["output_name"], file_id)

    if use_pandas:
        # allows users to enter their own names parameter
        _kwargs = {'names': headers}
        _kwargs.update(kwargs)
        _kwargs['compression'] = 'gzip'

        data = pd.read_csv(url, **_kwargs)
    else:
        response = requests.get(url, stream=True)
        response.raise_for_status()

        with StringIO() as buf:
            if headers:
                buf.write(','.join(headers) + '\n')
            _decompress_stream(response, buf, write_bytes=False)
            buf.seek(0)
            data = list(csv.reader(buf, **kwargs))

    return data
def civis_to_csv(filename, sql, database, job_name=None, api_key=None,
                 client=None, credential_id=None, include_header=True,
                 compression='none', delimiter=',', unquoted=False,
                 archive=False, hidden=True, polling_interval=None):
    """Export data from Civis to a local CSV file.

    The custom SQL string will be executed twice; once to attempt to
    retrieve headers and once to retrieve the data. This is done to
    use a more performant method for retrieving the data. The first
    execution of the custom SQL is controlled such that changes in
    state cannot occur (e.g., INSERT, UPDATE, DELETE, etc.).

    Parameters
    ----------
    filename : str
        Download exported data into this file.
    sql : str, optional
        The SQL select string to be executed.
    database : str or int
        Export data from this database. Can be the database name or ID.
    job_name : str, optional
        A name to give the job. If omitted, a random job name will be used.
    api_key : DEPRECATED str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    client : :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.
    credential_id : str or int, optional
        The ID of the database credential. If ``None``, the default
        credential will be used.
    include_header: bool, optional
        If ``True``, the first line of the CSV will be headers.
        Default: ``True``.
    compression: str, optional
        Type of compression to use, if any. One of ``'none'``, ``'zip'``, or
        ``'gzip'``. Default ``'none'``. ``'gzip'`` currently returns a file
        with no compression unless include_header is set to False. In a
        future release, a ``'gzip'`` compressed file will be returned for
        all cases.
    delimiter: str, optional
        Which delimiter to use, if any. One of ``','``, ``'\t'``, or
        ``'|'``. Default: ``','``.
    unquoted: bool, optional
        Whether or not to quote fields. Default: ``False``.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for query completion.
    archive : bool, optional (deprecated)
        If ``True``, archive the import job as soon as it completes.
    hidden : bool, optional
        If ``True`` (the default), this job will not appear in the Civis UI.

    Returns
    -------
    results : :class:`~civis.futures.CivisFuture`
        A `CivisFuture` object.

    Examples
    --------
    >>> sql = "SELECT * FROM schema.table"
    >>> fut = civis_to_csv("file.csv", sql, "my_database")
    >>> fut.result()  # Wait for job to complete

    See Also
    --------
    civis.io.read_civis : Read table contents into memory.
    civis.io.read_civis_sql : Read results of a SQL query into memory.
    """
    if archive:
        warnings.warn("`archive` is deprecated and will be removed in v2.0.0. "
                      "Use `hidden` instead.", FutureWarning)

    if client is None:
        client = APIClient(api_key=api_key, resources='all')

    db_id = client.get_database_id(database)
    credential_id = credential_id or client.default_credential

    # don't fix bug that would cause breaking change for now
    # when gzip compression is requested, a gzip file is not actually returned
    # instead the gzip file is decompressed during download
    if compression == 'gzip' and include_header:
        compression = 'none'

    # don't support parallel unload; the output format
    # is different which would introduce a breaking change
    headers = b''

    delimiter = DELIMITERS.get(delimiter)
    if not delimiter:
        raise ValueError("delimiter must be one of {}"
                         .format(DELIMITERS.keys()))

    # always set compression to gzip to reduce I/O
    csv_settings = dict(include_header=include_header,
                        compression='gzip',
                        column_delimiter=delimiter,
                        unquoted=unquoted,
                        filename_prefix=None,
                        force_multifile=False)
    script_id, run_id = _sql_script(client, sql, db_id, job_name,
                                    credential_id, hidden=hidden,
                                    csv_settings=csv_settings)

    fut = CivisFuture(client.scripts.get_sql_runs, (script_id, run_id),
                      polling_interval=polling_interval, client=client,
                      poll_on_creation=False)
    # Download (and decompress, as needed) the export once the run completes.
    download = _download_callback(script_id, run_id, filename,
                                  headers, compression)
    fut.add_done_callback(download)
    if archive:
        def f(x):
            return client.scripts.put_sql_archive(script_id, True)
        fut.add_done_callback(f)

    return fut
def civis_file_to_table(file_id, database, table, client=None,
                        max_errors=None, existing_table_rows="fail",
                        diststyle=None, distkey=None,
                        sortkey1=None, sortkey2=None,
                        delimiter=",", headers=None,
                        credential_id=None, polling_interval=None,
                        hidden=True):
    """Upload the contents of a Civis file to a Civis table.

    Parameters
    ----------
    file_id : int
        Civis file ID.
    database : str or int
        Upload data into this database. Can be the database name or ID.
    table : str
        The schema and table you want to upload to. E.g.,
        ``'scratch.table'``.
    client : :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.
    max_errors : int, optional
        The maximum number of rows with errors to remove from the import
        before failing.
    existing_table_rows : str, optional
        The behaviour if a table with the requested name already exists.
        One of ``'fail'``, ``'truncate'``, ``'append'`` or ``'drop'``.
        Defaults to ``'fail'``.
    diststyle : str, optional
        The distribution style for the table.
        One of ``'even'``, ``'all'`` or ``'key'``.
    distkey : str, optional
        The column to use as the distkey for the table.
    sortkey1 : str, optional
        The column to use as the sortkey for the table.
    sortkey2 : str, optional
        The second column in a compound sortkey for the table.
    delimiter : string, optional
        The column delimiter. One of ``','``, ``'\\t'`` or ``'|'``.
    headers : bool, optional
        Whether or not the first row of the file should be treated as
        headers. The default, ``None``, attempts to autodetect whether
        or not the first row contains headers.
    credential_id : str or int, optional
        The ID of the database credential. If ``None``, the default
        credential will be used.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for job completion.
    hidden : bool, optional
        If ``True`` (the default), this job will not appear in the Civis UI.

    Returns
    -------
    results : :class:`~civis.futures.CivisFuture`
        A `CivisFuture` object.

    Raises
    ------
    ValueError
        If ``delimiter`` is not one of the supported delimiters.

    Examples
    --------
    >>> file_id = 100
    >>> fut = civis.io.civis_file_to_table(file_id,
    ...                                    'my-database',
    ...                                    'scratch.my_data')
    >>> fut.result()
    """
    if client is None:
        client = APIClient(resources='all')
    schema, table = table.split(".", 1)
    db_id = client.get_database_id(database)
    cred_id = credential_id or client.default_credential
    delimiter = DELIMITERS.get(delimiter)
    # Raise rather than assert: asserts are stripped under `python -O`,
    # and `civis_to_csv` already raises ValueError for the same check,
    # so this keeps validation consistent across the module.
    if not delimiter:
        raise ValueError("delimiter must be one of {}".format(
            DELIMITERS.keys()))

    destination = dict(remote_host_id=db_id, credential_id=cred_id)
    import_name = 'CSV import to {}.{}'.format(schema, table)
    import_job = client.imports.post(import_name, 'AutoImport',
                                     is_outbound=False,
                                     destination=destination,
                                     hidden=hidden)

    options = dict(max_errors=max_errors,
                   existing_table_rows=existing_table_rows,
                   diststyle=diststyle, distkey=distkey,
                   sortkey1=sortkey1, sortkey2=sortkey2,
                   column_delimiter=delimiter,
                   first_row_is_header=headers)
    # Attach a sync that pulls from the Civis file and pushes into the
    # destination database table, then kick off a run of the import job.
    client.imports.post_syncs(
        import_job.id,
        source=dict(file=dict(id=file_id)),
        destination=dict(database_table=dict(schema=schema, table=table)),
        advanced_options=options)
    run = client.jobs.post_runs(import_job.id)
    fut = CivisFuture(client.jobs.get_runs,
                      (import_job.id, run['id']),
                      polling_interval=polling_interval,
                      client=client,
                      poll_on_creation=False)
    return fut
def transfer_table(source_db, dest_db, source_table, dest_table,
                   job_name=None, api_key=None, client=None,
                   source_credential_id=None, dest_credential_id=None,
                   polling_interval=None, **advanced_options):
    """Transfer a table from one location to another.

    Parameters
    ----------
    source_db : str or int
        The name of the database where the source table is located.
        Optionally, could be the database ID.
    dest_db : str or int
        The name of the database where the table will be transfered.
        Optionally, could be the database ID.
    source_table : str
        Full name of the table to transfer, e.g., ``'schema.table'``.
    dest_table : str
        Full name of the table in the destination database, e.g.,
        ``'schema.table'``.
    job_name : str, optional
        A name to give the job. If omitted, a random job name will be
        used.
    api_key : DEPRECATED str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    client : :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.
    source_credential_id : str or int, optional
        Optional credential ID for the source database. If ``None``, the
        default credential will be used.
    dest_credential_id : str or int, optional
        Optional credential ID for the destination database. If ``None``,
        the default credential will be used.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for job completion.
    **advanced_options : kwargs
        Extra keyword arguments will be passed to the import sync job. See
        :func:`~civis.resources._resources.Imports.post_syncs`.

    Returns
    -------
    results : :class:`~civis.futures.CivisFuture`
        A `CivisFuture` object.

    Examples
    --------
    >>> transfer_table(source_db='Cluster A', dest_db='Cluster B',
    ...                source_table='schma.tbl', dest_table='schma.tbl')
    """
    if client is None:
        client = APIClient(api_key=api_key)
    # Either side falls back to the client's default credential.
    src_cred = source_credential_id or client.default_credential
    dst_cred = dest_credential_id or client.default_credential
    sync_name = maybe_get_random_name(job_name)

    source = dict(remote_host_id=client.get_database_id(source_db),
                  credential_id=src_cred)
    destination = dict(remote_host_id=client.get_database_id(dest_db),
                       credential_id=dst_cred)
    import_id = client.imports.post(sync_name, "Dbsync", True,
                                    source=source,
                                    destination=destination).id

    # Register the table-to-table sync on the import, then start a run.
    client.imports.post_syncs(id=import_id,
                              source={'path': source_table},
                              destination={'path': dest_table},
                              advanced_options=advanced_options)
    run_id = client.imports.post_runs(id=import_id).run_id
    log.debug('Started run %d of sync for import %d', run_id, import_id)
    return CivisFuture(client.imports.get_files_runs,
                       (import_id, run_id),
                       polling_interval=polling_interval,
                       client=client,
                       poll_on_creation=False)
def test_set_api_result_succeeded(self, mock_api):
    """A poller reporting 'succeeded' should finish the future at once."""
    succeeded_poller = _create_poller_mock("succeeded")
    fut = CivisFuture(succeeded_poller, (1, 2))
    assert fut._state == 'FINISHED'