def query_civis(sql, database, api_key=None, client=None, credential_id=None,
                preview_rows=10, polling_interval=None, hidden=True):
    """Submit a SQL statement to Civis as a query and return a future.

    Intended for statements that return nothing, or for which a short
    preview of the result is enough. For queries that return many rows,
    use :func:`~civis.io.read_civis_sql` instead.

    Parameters
    ----------
    sql : str
        The SQL statement to execute.
    database : str or int
        Name or ID of the database to run against.
    api_key : DEPRECATED str, optional
        Your Civis API key. Only used to build an
        :class:`civis.APIClient` when ``client`` is not supplied. If not
        given, the :envvar:`CIVIS_API_KEY` environment variable will be
        used.
    client : :class:`civis.APIClient`, optional
        Client to use for every API call. If omitted, a new
        :class:`civis.APIClient` is created.
    credential_id : str or int, optional
        ID of the database credential. If ``None`` (or otherwise
        falsy), the client's default credential is used.
    preview_rows : int, optional
        The maximum number of rows to return. No more than 100 rows can
        be returned at once.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for query completion.
    hidden : bool, optional
        If ``True`` (the default), this job will not appear in the
        Civis UI.

    Returns
    -------
    results : :class:`~civis.futures.CivisFuture`
        A `CivisFuture` object tracking the submitted query.

    Examples
    --------
    >>> run = query_civis(sql="DELETE schema.table", database='database')
    >>> run.result()  # Wait for query to complete
    """
    api = APIClient(api_key=api_key) if client is None else client
    db_id = api.get_database_id(database)

    # Only consult the client's default credential when none was supplied.
    if credential_id:
        credential = credential_id
    else:
        credential = api.default_credential

    query = api.queries.post(db_id, sql, preview_rows,
                             credential=credential, hidden=hidden)
    poll_args = (query.id,)
    return CivisFuture(api.queries.get, poll_args, polling_interval,
                       client=api, poll_on_creation=False)
def query_civis(sql, database, api_key=None, credential_id=None,
                preview_rows=10,
                polling_interval=_DEFAULT_POLLING_INTERVAL):
    """Submit a SQL statement to Civis as a query and return a poller.

    Intended for statements that return nothing, or for which a short
    preview of the result is enough. For queries that return many rows,
    use :func:`~civis.io.read_civis_sql` instead.

    Parameters
    ----------
    sql : str
        The SQL statement to execute.
    database : str or int
        Name or ID of the database to run against.
    api_key : str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    credential_id : str or int, optional
        ID of the database credential. If ``None`` (or otherwise
        falsy), the client's default credential is used.
    preview_rows : int, optional
        The maximum number of rows to return. No more than 100 rows can
        be returned at once.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for query completion.

    Returns
    -------
    results : :class:`~civis.polling.PollableResult`
        A `PollableResult` object tracking the submitted query.

    Examples
    --------
    >>> run = query_civis(sql="DELETE schema.table", database='database')
    >>> run.result()  # Wait for query to complete
    """
    api = APIClient(api_key=api_key)
    db_id = api.get_database_id(database)

    # Only consult the client's default credential when none was supplied.
    if credential_id:
        credential = credential_id
    else:
        credential = api.default_credential

    query = api.queries.post(db_id, sql, preview_rows,
                             credential=credential)
    poll_args = (query.id,)
    return PollableResult(api.queries.get, poll_args, polling_interval)
def _import_bytes(buf, database, table, api_key, max_errors,
                  existing_table_rows, distkey, sortkey1, sortkey2,
                  delimiter, headers, credential_id, polling_interval,
                  archive, hidden):
    """Upload ``buf`` through a Civis file-import job and start the run.

    Creates an import with ``client.imports.post_files``, PUTs ``buf``
    to the import's upload URI, then POSTs to the import's run URI.

    Parameters
    ----------
    buf
        Request body handed to ``requests.put`` for the upload.
    database : str or int
        Name or ID of the destination database.
    table : str
        Destination as ``'schema.table'``; split on the first ``'.'``.
    api_key : str or None
        Passed to :class:`civis.APIClient` and to the returned future.
    max_errors, existing_table_rows, distkey, sortkey1, sortkey2
        Forwarded unchanged to ``client.imports.post_files``.
    delimiter : str
        Key into ``DELIMITERS``; the mapped value is sent as
        ``column_delimiter``.
    headers : bool or None
        Sent as ``first_row_is_header``.
    credential_id : str or int or None
        Database credential; falls back to the client's default
        credential when falsy.
    polling_interval : int or float or None
        Passed to the returned future.
    archive : bool
        If true, archive the import job once the future is done.
    hidden : bool
        Forwarded to ``client.imports.post_files``.

    Returns
    -------
    :class:`~civis.futures.CivisFuture`
        Future polling ``client.imports.get_files_runs`` for the run.

    Raises
    ------
    ValueError
        If ``table`` contains no ``'.'`` (the split cannot be unpacked).
    AssertionError
        If ``delimiter`` does not map to a truthy entry of
        ``DELIMITERS``.
    requests.HTTPError
        If the upload or the run-start request reports an HTTP error.
    """
    client = APIClient(api_key=api_key)
    schema_name, table_name = table.split(".", 1)
    database_id = client.get_database_id(database)
    if credential_id:
        credential = credential_id
    else:
        credential = client.default_credential
    column_delimiter = DELIMITERS.get(delimiter)
    assert column_delimiter, "delimiter must be one of {}".format(
        DELIMITERS.keys())

    import_job = client.imports.post_files(
        schema=schema_name,
        name=table_name,
        remote_host_id=database_id,
        credential_id=credential,
        max_errors=max_errors,
        existing_table_rows=existing_table_rows,
        distkey=distkey,
        sortkey1=sortkey1,
        sortkey2=sortkey2,
        column_delimiter=column_delimiter,
        first_row_is_header=headers,
        hidden=hidden,
    )

    # Push the payload to the pre-signed upload location.
    upload = requests.put(import_job.upload_uri, buf)
    upload.raise_for_status()

    # Start the import run through the client's own HTTP session.
    started = client._session.post(import_job.run_uri)
    started.raise_for_status()
    run_info = started.json()

    poll_args = (run_info['importId'], run_info['id'])
    future = CivisFuture(client.imports.get_files_runs, poll_args,
                         polling_interval=polling_interval,
                         api_key=api_key,
                         poll_on_creation=False)

    if archive:
        def _archive_import(_done_future):
            return client.imports.put_archive(import_job.id, True)

        future.add_done_callback(_archive_import)

    return future
def transfer_table(source_db, dest_db, source_table, dest_table,
                   job_name=None, api_key=None, source_credential_id=None,
                   dest_credential_id=None,
                   polling_interval=_DEFAULT_POLLING_INTERVAL,
                   **advanced_options):
    """Copy a table from one database to another with a "Dbsync" import.

    Parameters
    ----------
    source_db : str or int
        Name (or ID) of the database holding the source table.
    dest_db : str or int
        Name (or ID) of the database the table will be transferred to.
    source_table : str
        Full name of the table to transfer, e.g., ``'schema.table'``.
    dest_table : str
        Full name of the table in the destination database, e.g.,
        ``'schema.table'``.
    job_name : str, optional
        A name to give the job. If omitted, a random job name will be
        used.
    api_key : str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    source_credential_id : str or int, optional
        Credential ID for the source database. If ``None``, the default
        credential will be used.
    dest_credential_id : str or int, optional
        Credential ID for the destination database. If ``None``, the
        default credential will be used.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for job completion.
    **advanced_options : kwargs
        Extra keyword arguments will be passed to the import sync job.
        See :func:`~civis.resources._resources.Imports.post_syncs`.

    Returns
    -------
    results : :class:`~civis.polling.PollableResult`
        A `PollableResult` object.

    Examples
    --------
    >>> transfer_table(source_db='Cluster A', dest_db='Cluster B',
    ...                source_table='schma.tbl', dest_table='schma.tbl')
    """
    client = APIClient(api_key=api_key)

    # Fall back to the client's default credential on either side.
    if source_credential_id:
        src_credential = source_credential_id
    else:
        src_credential = client.default_credential
    if dest_credential_id:
        dst_credential = dest_credential_id
    else:
        dst_credential = client.default_credential

    job_name = maybe_get_random_name(job_name)

    source = {
        'remote_host_id': client.get_database_id(source_db),
        'credential_id': src_credential,
    }
    destination = {
        'remote_host_id': client.get_database_id(dest_db),
        'credential_id': dst_credential,
    }

    # Create the import, attach the table-to-table sync, then run it.
    import_job = client.imports.post(job_name, "Dbsync", True,
                                     source=source, destination=destination)
    job_id = import_job.id
    client.imports.post_syncs(id=job_id,
                              source={'path': source_table},
                              destination={'path': dest_table},
                              advanced_options=advanced_options)
    run_id = client.imports.post_runs(id=job_id).run_id

    return PollableResult(client.imports.get_files_runs,
                          (job_id, run_id),
                          polling_interval)
def civis_file_to_table(file_id, database, table, client=None,
                        max_errors=None, existing_table_rows="fail",
                        diststyle=None, distkey=None,
                        sortkey1=None, sortkey2=None,
                        delimiter=",", headers=None,
                        credential_id=None, polling_interval=None,
                        hidden=True):
    """Load the contents of a Civis file into a Civis table.

    Parameters
    ----------
    file_id : int
        Civis file ID.
    database : str or int
        Upload data into this database. Can be the database name or ID.
    table : str
        The schema and table you want to upload to. E.g.,
        ``'scratch.table'``.
    client : :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.
    max_errors : int, optional
        The maximum number of rows with errors to remove from the
        import before failing.
    existing_table_rows : str, optional
        The behaviour if a table with the requested name already
        exists. One of ``'fail'``, ``'truncate'``, ``'append'`` or
        ``'drop'``. Defaults to ``'fail'``.
    diststyle : str, optional
        The distribution style for the table. One of ``'even'``,
        ``'all'`` or ``'key'``.
    distkey : str, optional
        The column to use as the distkey for the table.
    sortkey1 : str, optional
        The column to use as the sortkey for the table.
    sortkey2 : str, optional
        The second column in a compound sortkey for the table.
    delimiter : string, optional
        The column delimiter. One of ``','``, ``'\\t'`` or ``'|'``.
    headers : bool, optional
        Whether or not the first row of the file should be treated as
        headers. The default, ``None``, attempts to autodetect whether
        or not the first row contains headers.
    credential_id : str or int, optional
        The ID of the database credential. If ``None``, the default
        credential will be used.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for job completion.
    hidden : bool, optional
        If ``True`` (the default), this job will not appear in the
        Civis UI.

    Returns
    -------
    results : :class:`~civis.futures.CivisFuture`
        A `CivisFuture` object.

    Raises
    ------
    AssertionError
        If ``delimiter`` does not map to a truthy entry of
        ``DELIMITERS``.

    Examples
    --------
    >>> file_id = 100
    >>> fut = civis.io.civis_file_to_table(file_id,
    ...                                    'my-database',
    ...                                    'scratch.my_data')
    >>> fut.result()
    """
    if client is None:
        client = APIClient(resources='all')

    schema_name, table_name = table.split(".", 1)
    database_id = client.get_database_id(database)
    if credential_id:
        credential = credential_id
    else:
        credential = client.default_credential
    column_delimiter = DELIMITERS.get(delimiter)
    assert column_delimiter, "delimiter must be one of {}".format(
        DELIMITERS.keys())

    # Create the (inbound) import job pointed at the target database.
    import_job = client.imports.post(
        'CSV import to {}.{}'.format(schema_name, table_name),
        'AutoImport',
        is_outbound=False,
        destination={'remote_host_id': database_id,
                     'credential_id': credential},
        hidden=hidden)

    # Attach a sync from the Civis file to the destination table.
    sync_options = {
        'max_errors': max_errors,
        'existing_table_rows': existing_table_rows,
        'diststyle': diststyle,
        'distkey': distkey,
        'sortkey1': sortkey1,
        'sortkey2': sortkey2,
        'column_delimiter': column_delimiter,
        'first_row_is_header': headers,
    }
    client.imports.post_syncs(
        import_job.id,
        source={'file': {'id': file_id}},
        destination={'database_table': {'schema': schema_name,
                                        'table': table_name}},
        advanced_options=sync_options)

    run = client.jobs.post_runs(import_job.id)
    return CivisFuture(client.jobs.get_runs,
                       (import_job.id, run['id']),
                       polling_interval=polling_interval,
                       client=client,
                       poll_on_creation=False)
def civis_to_csv(filename, sql, database, job_name=None, api_key=None,
                 client=None, credential_id=None, include_header=True,
                 compression='none', delimiter=',', unquoted=False,
                 archive=False, hidden=True, polling_interval=None):
    """Export the results of a SQL query from Civis to a local CSV file.

    The query is submitted as a SQL script with CSV output settings;
    a callback attached to the returned future downloads the output to
    ``filename`` once the run is done.

    Parameters
    ----------
    filename : str
        Download exported data into this file.
    sql : str
        The SQL select string to be executed.
    database : str or int
        Export data from this database. Can be the database name or ID.
    job_name : str, optional
        A name to give the job. If omitted, a random job name will be
        used.
    api_key : DEPRECATED str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    client : :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.
    credential_id : str or int, optional
        The ID of the database credential. If ``None``, the default
        credential will be used.
    include_header : bool, optional
        If ``True``, the first line of the CSV will be headers.
        Default: ``True``.
    compression : str, optional
        Type of compression to use, if any. One of ``'none'``,
        ``'zip'``, or ``'gzip'``. Default ``'none'``. ``'gzip'``
        currently returns a file with no compression unless
        include_header is set to False. In a future release, a
        ``'gzip'`` compressed file will be returned for all cases.
    delimiter : str, optional
        Which delimiter to use, if any. One of ``','``, ``'\\t'``, or
        ``'|'``. Default: ``','``.
    unquoted : bool, optional
        Whether or not to quote fields. Default: ``False``.
    archive : bool, optional (deprecated)
        If ``True``, archive the job as soon as it completes.
    hidden : bool, optional
        If ``True`` (the default), this job will not appear in the
        Civis UI.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for query completion.

    Returns
    -------
    results : :class:`~civis.futures.CivisFuture`
        A `CivisFuture` object.

    Raises
    ------
    ValueError
        If ``delimiter`` does not map to a truthy entry of
        ``DELIMITERS``.

    Examples
    --------
    >>> sql = "SELECT * FROM schema.table"
    >>> fut = civis_to_csv("file.csv", sql, "my_database")
    >>> fut.result()  # Wait for job to complete

    See Also
    --------
    civis.io.read_civis : Read table contents into memory.
    civis.io.read_civis_sql : Read results of a SQL query into memory.
    """
    if archive:
        warnings.warn(
            "`archive` is deprecated and will be removed in v2.0.0. "
            "Use `hidden` instead.",
            FutureWarning)

    if client is None:
        client = APIClient(api_key=api_key, resources='all')

    database_id = client.get_database_id(database)
    if not credential_id:
        credential_id = client.default_credential

    # Kept on purpose to avoid a breaking change: a 'gzip' request that
    # includes headers is decompressed during download, so the local file
    # is not actually gzip-compressed.
    if include_header and compression == 'gzip':
        compression = 'none'

    # Parallel unload is not supported here (its output format differs,
    # which would be a breaking change), so no separate header row is used.
    header_bytes = b''

    column_delimiter = DELIMITERS.get(delimiter)
    if not column_delimiter:
        raise ValueError("delimiter must be one of {}".format(
            DELIMITERS.keys()))

    # The script output is always gzip-compressed to reduce I/O.
    csv_settings = {
        'include_header': include_header,
        'compression': 'gzip',
        'column_delimiter': column_delimiter,
        'unquoted': unquoted,
        'filename_prefix': None,
        'force_multifile': False,
    }
    script_id, run_id = _sql_script(client, sql, database_id, job_name,
                                    credential_id, hidden=hidden,
                                    csv_settings=csv_settings)

    future = CivisFuture(client.scripts.get_sql_runs, (script_id, run_id),
                         polling_interval=polling_interval,
                         client=client,
                         poll_on_creation=False)

    future.add_done_callback(
        _download_callback(script_id, run_id, filename,
                           header_bytes, compression))

    if archive:
        def _archive_script(_done_future):
            return client.scripts.put_sql_archive(script_id, True)

        future.add_done_callback(_archive_script)

    return future
def read_civis_sql(sql, database, use_pandas=False, job_name=None,
                   api_key=None, client=None, credential_id=None,
                   polling_interval=None, archive=False, hidden=True,
                   **kwargs):
    """Run a custom SQL string in Civis and read the results into memory.

    The custom SQL string will be executed twice; once to attempt to
    retrieve headers and once to retrieve the data. This is done to use
    a more performant method for retrieving the data. The first
    execution of the custom SQL is controlled such that changes in
    state cannot occur (e.g., INSERT, UPDATE, DELETE, etc.).

    Parameters
    ----------
    sql : str
        The SQL select string to be executed.
    database : str or int
        Execute the query against this database. Can be the database
        name or ID.
    use_pandas : bool, optional
        If ``True``, return a :class:`pandas:pandas.DataFrame`.
        Otherwise, return a list of results from
        :func:`python:csv.reader`.
    job_name : str, optional
        A name to give the job. If omitted, a random job name will be
        used.
    api_key : DEPRECATED str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    client : :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.
    credential_id : str or int, optional
        The database credential ID. If ``None``, the default credential
        will be used.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for query completion.
    archive : bool, optional (deprecated)
        If ``True``, archive the job as soon as it completes.
    hidden : bool, optional
        If ``True`` (the default), this job will not appear in the
        Civis UI.
    **kwargs : kwargs
        Extra keyword arguments are passed into
        :func:`pandas:pandas.read_csv` if `use_pandas` is ``True`` or
        passed into :func:`python:csv.reader` if `use_pandas` is
        ``False``.

    Returns
    -------
    data : :class:`pandas:pandas.DataFrame` or list
        A list of rows (with header as first row) if `use_pandas` is
        ``False``, otherwise a `pandas` `DataFrame`. Note that if
        `use_pandas` is ``False``, no parsing of types is performed and
        each row will be a list of strings.

    Raises
    ------
    ImportError
        If `use_pandas` is ``True`` and `pandas` is not installed.
    EmptyResultError
        If the finished run reports no output.

    Examples
    --------
    >>> sql = "SELECT * FROM schema.table"
    >>> df = read_civis_sql(sql, "my_database", use_pandas=True)
    >>> col_a = df["column_a"]

    >>> data = read_civis_sql(sql, "my_database")
    >>> columns = data.pop(0)
    >>> col_a_index = columns.index("column_a")
    >>> col_a = [row[col_a_index] for row in data]

    Notes
    -----
    This reads the data into memory.

    See Also
    --------
    civis.io.read_civis : Read directly into memory without SQL.
    civis.io.civis_to_csv : Write directly to a CSV file.
    """
    if client is None:
        client = APIClient(api_key=api_key, resources='all')
    if use_pandas and NO_PANDAS:
        raise ImportError("use_pandas is True but pandas is not installed.")
    if archive:
        warnings.warn(
            "`archive` is deprecated and will be removed in v2.0.0. "
            "Use `hidden` instead.",
            FutureWarning)

    database_id = client.get_database_id(database)
    if not credential_id:
        credential_id = client.default_credential

    # Find out whether headers can be requested separately. When they can,
    # Platform performs a parallel unload, which is significantly more
    # performant. Start from the assumption that headers are wanted.
    include_header, headers = _include_header(
        client, sql, True, database_id, credential_id, polling_interval)

    # Separately retrieved headers mean a parallel unload is in use, and
    # that output needs backslash as the escape character.
    if headers is not None:
        kwargs['escapechar'] = '\\'

    csv_settings = {'include_header': include_header,
                    'compression': 'gzip'}
    script_id, run_id = _sql_script(client, sql, database_id, job_name,
                                    credential_id,
                                    csv_settings=csv_settings,
                                    hidden=hidden)

    future = CivisFuture(client.scripts.get_sql_runs, (script_id, run_id),
                         polling_interval=polling_interval,
                         client=client,
                         poll_on_creation=False)
    if archive:
        def _archive_script(_done_future):
            return client.scripts.put_sql_archive(script_id, True)

        future.add_done_callback(_archive_script)

    # Block until the export has finished.
    future.result()

    outputs = client.scripts.get_sql_runs(script_id, run_id)["output"]
    if not outputs:
        raise EmptyResultError(
            "Query {} returned no output.".format(script_id))

    first_output = outputs[0]
    url = first_output["path"]
    file_id = first_output["file_id"]
    log.debug('Exported results to Civis file %s (%s)',
              first_output["output_name"], file_id)

    if use_pandas:
        # 'names' is only a default: callers may pass their own, while
        # 'compression' is always forced to gzip.
        read_options = {'names': headers}
        read_options.update(kwargs, compression='gzip')
        return pd.read_csv(url, **read_options)

    response = requests.get(url, stream=True)
    response.raise_for_status()
    with StringIO() as text_buffer:
        if headers:
            text_buffer.write(','.join(headers) + '\n')
        _decompress_stream(response, text_buffer, write_bytes=False)
        text_buffer.seek(0)
        rows = list(csv.reader(text_buffer, **kwargs))
    return rows
def csv_to_civis(filename, database, table, api_key=None,
                 max_errors=None, existing_table_rows="fail",
                 distkey=None, sortkey1=None, sortkey2=None,
                 delimiter=",", headers=None,
                 credential_id=None,
                 polling_interval=_DEFAULT_POLLING_INTERVAL,
                 archive=True):
    """Upload a local CSV file into a Civis table.

    Parameters
    ----------
    filename : str
        Upload the contents of this file.
    database : str or int
        Upload data into this database. Can be the database name or ID.
    table : str
        The schema and table you want to upload to. E.g.,
        ``'scratch.table'``.
    api_key : str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    max_errors : int, optional
        The maximum number of rows with errors to remove from the
        import before failing.
    existing_table_rows : str, optional
        The behaviour if a table with the requested name already
        exists. One of ``'fail'``, ``'truncate'`` or ``'append'``.
        Defaults to ``'fail'``.
    distkey : str, optional
        The column to use as the distkey for the table.
    sortkey1 : str, optional
        The column to use as the sortkey for the table.
    sortkey2 : str, optional
        The second column in a compound sortkey for the table.
    delimiter : string, optional
        The column delimiter. One of ``','``, ``'\\t'`` or ``'|'``.
    headers : bool, optional
        Whether or not the first row of the file should be treated as
        headers. The default, ``None``, attempts to autodetect whether
        or not the first row contains headers.
    credential_id : str or int, optional
        The ID of the database credential. If ``None``, the default
        credential will be used.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for job completion.
    archive : bool, optional
        If ``True`` (the default), archive the import job as soon as it
        completes.

    Returns
    -------
    results : :class:`~civis.polling.PollableResult`
        A `PollableResult` object.

    Raises
    ------
    AssertionError
        If ``delimiter`` does not map to a truthy entry of
        ``DELIMITERS``.

    Notes
    -----
    The file is opened in binary mode and the open handle is handed to
    ``requests.put`` as the request body.

    Examples
    --------
    >>> with open('input_file.csv', 'w') as _input:
    ...     _input.write('a,b,c\\n1,2,3')
    >>> poller = civis.io.csv_to_civis('input_file.csv',
    ...                                'my-database',
    ...                                'scratch.my_data')
    >>> poller.result()
    """
    client = APIClient(api_key=api_key)
    schema_name, table_name = table.split(".", 1)
    database_id = client.get_database_id(database)
    if credential_id:
        credential = credential_id
    else:
        credential = client.default_credential
    column_delimiter = DELIMITERS.get(delimiter)
    assert column_delimiter, "delimiter must be one of {}".format(
        DELIMITERS.keys())

    import_job = client.imports.post_files(
        schema=schema_name,
        name=table_name,
        remote_host_id=database_id,
        credential_id=credential,
        max_errors=max_errors,
        existing_table_rows=existing_table_rows,
        distkey=distkey,
        sortkey1=sortkey1,
        sortkey2=sortkey2,
        column_delimiter=column_delimiter,
        first_row_is_header=headers,
    )

    # Push the file to the pre-signed upload location.
    with open(filename, "rb") as csv_file:
        upload = requests.put(import_job.upload_uri, csv_file)
    upload.raise_for_status()

    # Start the import run through the client's own HTTP session.
    started = client._session.post(import_job.run_uri)
    started.raise_for_status()
    run_info = started.json()

    poller = PollableResult(client.imports.get_files_runs,
                            (run_info['importId'], run_info['id']),
                            polling_interval=polling_interval)

    if archive:
        def _archive_import(_done_future):
            return client.imports.put_archive(import_job.id, True)

        poller.add_done_callback(_archive_import)

    return poller
class ModelPipeline: """Interface for scikit-learn modeling in the Civis Platform Each ModelPipeline corresponds to a scikit-learn :class:`~sklearn.pipeline.Pipeline` which will run in Civis Platform. Note that this object can be safely pickled and unpickled, but it does not store the state of any attached :class:`~civis.APIClient` object. An unpickled ModelPipeline will use the API key from the user's environment. Parameters ---------- model : string or Estimator Either the name of a pre-defined model (e.g. "sparse_logistic" or "gradient_boosting_classifier") or else a pre-existing Estimator object. dependent_variable : string or List[str] The dependent variable of the training dataset. For a multi-target problem, this should be a list of column names of dependent variables. primary_key : string, optional The unique ID (primary key) of the training dataset. This will be used to index the out-of-sample scores. parameters : dict, optional Specify parameters for the final stage estimator in a predefined model, e.g. ``{'C': 2}`` for a "sparse_logistic" model. cross_validation_parameters : dict, optional Cross validation parameter grid for learner parameters, e.g. ``{{'n_estimators': [100, 200, 500], 'learning_rate': [0.01, 0.1], 'max_depth': [2, 3]}}``. model_name : string, optional The prefix of the Platform modeling jobs. It will have " Train" or " Predict" added to become the Script title. calibration : {None, "sigmoid", "isotonic"} If not None, calibrate output probabilities with the selected method. Valid only with classification models. excluded_columns : array, optional A list of columns which will be considered ineligible to be independent variables. client : :class:`~civis.APIClient`, optional If not provided, an :class:`~civis.APIClient` object will be created from the :envvar:`CIVIS_API_KEY`. cpu_requested : int, optional Number of CPU shares requested in the Civis Platform for training jobs. 1024 shares = 1 CPU. 
memory_requested : int, optional Memory requested from Civis Platform for training jobs, in MiB disk_requested : float, optional Disk space requested on Civis Platform for training jobs, in GB notifications : dict See :func:`~civis.resources._resources.Scripts.post_custom` for further documentation about email and URL notification. dependencies : array, optional List of packages to install from PyPI or git repository (i.e., Github or Bitbucket). If a private repo is specified, please include a ``git_token_name`` argument as well (see below). Make sure to pin dependencies to a specific version, since dependecies will be reinstalled during every training and predict job. git_token_name : str, optional Name of remote git API token stored in Civis Platform as the password field in a custom platform credential. Used only when installing private git repositories. verbose : bool, optional If True, supply debug outputs in Platform logs and make prediction child jobs visible. etl : Estimator, optional Custom ETL estimator which overrides the default ETL, and is run before training and validation. Methods ------- train() Train the model on data in Civis Platform; outputs :class:`~civis.ml.ModelFuture` predict() Make predictions on new data; outputs :class:`~civis.ml.ModelFuture` from_existing() Class method; use to create a :class:`~civis.ml.ModelPipeline` from an existing model training run Attributes ---------- estimator : :class:`~sklearn.pipeline.Pipeline` The trained scikit-learn Pipeline train_result_ : :class:`~civis.ml.ModelFuture` :class:`~civis.ml.ModelFuture` encapsulating this model's training run state : str Status of the training job (non-blocking) Examples -------- >>> from civis.ml import ModelPipeline >>> model = ModelPipeline('gradient_boosting_classifier', 'depvar', ... primary_key='voterbase_id') >>> train = model.train(table_name='schema.survey_data', ... fit_params={'sample_weight': 'survey_weight'}, ... database_name='My Redshift Cluster', ... 
oos_scores='scratch.survey_depvar_oos_scores') >>> train <ModelFuture at 0x11be7ae10 state=queued> >>> train.running() True >>> train.done() False >>> df = train.table # Read OOS scores from its Civis File. Blocking. >>> meta = train.metadata # Metadata from training run >>> train.metrics['roc_auc'] 0.88425 >>> pred = model.predict(table_name='schema.demographics_table ', ... database_name='My Redshift Cluster', ... output_table='schema.predicted_survey_response', ... if_exists='drop', ... n_jobs=50) >>> df_pred = pred.table # Blocks until finished # Modify the parameters of the base estimator in a default model: >>> model = ModelPipeline('sparse_logistic', 'depvar', ... primary_key='voterbase_id', ... parameters={'C': 2}) # Grid search over hyperparameters in the base estimator: >>> model = ModelPipeline('sparse_logistic', 'depvar', ... primary_key='voterbase_id', ... cross_validation_parameters={'C': [0.1, 1, 10]}) See Also -------- civis.ml.ModelFuture """ # These are the v2.0 templates train_template_id = 9968 predict_template_id = 9969 # These are the v1.1 templates _train_template_id_fallback = 9112 _predict_template_id_fallback = 9113 def _set_template_version(self, client): """Determine which version of CivisML to use. If the user has access to the newest templates, use them, otherwise fall back to the previous version. 
Used for internal or limited releases of new CivisML versions.""" if '_NEWEST_CIVISML_VERSION' not in globals(): global _NEWEST_CIVISML_VERSION try: newest_train = max(_PRED_TEMPLATES.keys()) # Check that we can access the newest templates client.templates.get_scripts(id=newest_train) client.templates.get_scripts(id=_PRED_TEMPLATES[newest_train]) except CivisAPIError: _NEWEST_CIVISML_VERSION = False else: _NEWEST_CIVISML_VERSION = True def __init__(self, model, dependent_variable, primary_key=None, parameters=None, cross_validation_parameters=None, model_name=None, calibration=None, excluded_columns=None, client=None, cpu_requested=None, memory_requested=None, disk_requested=None, notifications=None, dependencies=None, git_token_name=None, verbose=False, etl=None): self.model = model self._input_model = model # In case we need to modify the input if isinstance(dependent_variable, str): # Standardize the dependent variable as a list. dependent_variable = [dependent_variable] self.dependent_variable = dependent_variable # optional but common parameters self.primary_key = primary_key self.parameters = parameters or {} self.cv_params = cross_validation_parameters or {} self.model_name = model_name # None lets Platform use template name self.excluded_columns = excluded_columns self.calibration = calibration self.job_resources = {'REQUIRED_CPU': cpu_requested, 'REQUIRED_MEMORY': memory_requested, 'REQUIRED_DISK_SPACE': disk_requested} self.notifications = notifications or {} self.dependencies = dependencies self.git_token_name = git_token_name self.verbose = verbose if client is None: client = APIClient(resources='all') self._client = client self.train_result_ = None self._set_template_version(client) if _NEWEST_CIVISML_VERSION: self.etl = etl elif not _NEWEST_CIVISML_VERSION and etl is not None: raise NotImplementedError("The etl argument is not implemented" " in this version of CivisML.") else: # fall back to previous version templates self.train_template_id = 
self._train_template_id_fallback self.predict_template_id = self._predict_template_id_fallback def __getstate__(self): state = self.__dict__.copy() del state['_client'] return state def __setstate__(self, state): self.__dict__ = state self._client = APIClient(resources='all') self._set_template_version(self._client) @classmethod def from_existing(cls, train_job_id, train_run_id='latest', client=None): """Create a :class:`ModelPipeline` object from existing model IDs Parameters ---------- train_job_id : int The ID of the CivisML job in the Civis Platform train_run_id : int or string, optional Location of the model run, either * an explicit run ID, * "latest" : The most recent run * "active" : The run designated by the training job's "active build" parameter client : :class:`~civis.APIClient`, optional If not provided, an :class:`~civis.APIClient` object will be created from the :envvar:`CIVIS_API_KEY`. Returns ------- :class:`~civis.ml.ModelPipeline` A :class:`~civis.ml.ModelPipeline` which refers to a previously-trained model Examples -------- >>> from civis.ml import ModelPipeline >>> model = ModelPipeline.from_existing(job_id) >>> model.train_result_.metrics['roc_auc'] 0.843 """ train_job_id = int(train_job_id) # Convert np.int to int if client is None: client = APIClient(resources='all') train_run_id = _decode_train_run(train_job_id, train_run_id, client) try: fut = ModelFuture(train_job_id, train_run_id, client=client) container = client.scripts.get_containers(train_job_id) except CivisAPIError as api_err: if api_err.status_code == 404: msg = ('There is no Civis Platform job with ' 'script ID {} and run ID {}!'.format(train_job_id, train_run_id)) six.raise_from(ValueError(msg), api_err) raise args = container.arguments # Older templates used "WORKFLOW" instead of "MODEL" model = args.get('MODEL', args.get('WORKFLOW')) dependent_variable = args['TARGET_COLUMN'].split() primary_key = args.get('PRIMARY_KEY') parameters = json.loads(args.get('PARAMS', "{}")) 
cross_validation_parameters = json.loads(args.get('CVPARAMS', "{}")) calibration = args.get('CALIBRATION') excluded_columns = args.get('EXCLUDE_COLS', None) if excluded_columns: excluded_columns = excluded_columns.split() cpu_requested = args.get('REQUIRED_CPU') memory_requested = args.get('REQUIRED_MEMORY') disk_requested = args.get('REQUIRED_DISK_SPACE') name = container.name if name.endswith(' Train'): # Strip object-applied suffix name = name[:-len(' Train')] notifications = {camel_to_snake(key): val for key, val in container.notifications.items()} dependencies = args.get('DEPENDENCIES', None) if dependencies: dependencies = dependencies.split() git_token_name = args.get('GIT_CRED', None) if git_token_name: git_token_name = client.credentials.get(git_token_name).name klass = cls(model=model, dependent_variable=dependent_variable, primary_key=primary_key, model_name=name, parameters=parameters, cross_validation_parameters=cross_validation_parameters, calibration=calibration, excluded_columns=excluded_columns, client=client, cpu_requested=cpu_requested, disk_requested=disk_requested, memory_requested=memory_requested, notifications=notifications, dependencies=dependencies, git_token_name=git_token_name, verbose=args.get('DEBUG', False)) klass.train_result_ = fut # Set prediction template corresponding to training template template_id = int(container['from_template_id']) p_id = _PRED_TEMPLATES.get(template_id) if p_id is None: warnings.warn('Model %s was trained with a newer version of ' 'CivisML than is available in the API client ' 'version %s. Please update your API client version. ' 'Attempting to use an older version of the ' 'prediction code. Prediction will either fail ' 'immediately or succeed.' 
% (train_job_id, __version__), RuntimeWarning) p_id = max(_PRED_TEMPLATES.values()) klass.predict_template_id = p_id return klass def train(self, df=None, csv_path=None, table_name=None, database_name=None, file_id=None, sql_where=None, sql_limit=None, oos_scores=None, oos_scores_db=None, if_exists='fail', fit_params=None, polling_interval=None, validation_data='train', n_jobs=4): """Start a Civis Platform job to train your model Provide input through one of a :class:`~pandas.DataFrame` (``df``), a local CSV (``csv_path``), a Civis Table (``table_name`` and ``database_name``), or a Civis File containing a CSV (``file_id``). Model outputs will always contain out-of-sample scores (accessible through :attr:`ModelFuture.table` on this function's output), and you may chose to store these out-of-sample scores in a Civis Table with the ``oos_scores``, ``oos_scores_db``, and ``if_exists`` parameters. Parameters ---------- df : pd.DataFrame, optional A :class:`~pandas.DataFrame` of training data. The :class:`~pandas.DataFrame` will be uploaded to a Civis file so that CivisML can access it. Note that the index of the :class:`~pandas.DataFrame` will be ignored -- use ``df.reset_index()`` if you want your index column to be included with the data passed to CivisML. csv_path : str, optional The location of a CSV of data on the local disk. It will be uploaded to a Civis file. table_name : str, optional The qualified name of the table containing the training set from which to build the model. database_name : str, optional Name of the database holding the training set table used to build the model. E.g., 'My Cluster Name'. file_id : int, optional If the training data are stored in a Civis file, provide the integer file ID. 
sql_where : str, optional A SQL WHERE clause used to scope the rows of the training set (used for table input only) sql_limit : int, optional SQL LIMIT clause for querying the training set (used for table input only) oos_scores : str, optional If provided, store out-of-sample predictions on training set data to this Redshift "schema.tablename". oos_scores_db : str, optional If not provided, store OOS predictions in the same database which holds the training data. if_exists : {'fail', 'append', 'drop', 'truncate'} Action to take if the out-of-sample prediction table already exists. fit_params: Dict[str, str] Mapping from parameter names in the model's ``fit`` method to the column names which hold the data, e.g. ``{'sample_weight': 'survey_weight_column'}``. polling_interval : float, optional Check for job completion every this number of seconds. Do not set if using the notifications endpoint. validation_data : str, optional Source for validation data. There are currently two options: `'train'` (the default), which cross-validates over training data for validation; and `'skip'`, which skips the validation step. n_jobs : int, optional Number of jobs to use for training and validation. Defaults to 4, which allows parallelization over the 4 cross validation folds. Increase n_jobs to parallelize over many hyperparameter combinations in grid search/hyperband, or decrease to use fewer computational resources at once. 
Returns ------- :class:`~civis.ml.ModelFuture` """ if ((table_name is None or database_name is None) and file_id is None and df is None and csv_path is None): raise ValueError('Provide a source of data.') if sum((bool(table_name and database_name), bool(file_id), df is not None, csv_path is not None)) > 1: raise ValueError('Provide a single source of data.') if df is not None: file_id = _stash_local_dataframe(df, client=self._client) elif csv_path: file_id = _stash_local_file(csv_path, client=self._client) train_args = {'TARGET_COLUMN': ' '.join(self.dependent_variable), 'PRIMARY_KEY': self.primary_key, 'PARAMS': json.dumps(self.parameters), 'CVPARAMS': json.dumps(self.cv_params), 'CALIBRATION': self.calibration, 'IF_EXISTS': if_exists} if oos_scores: train_args['OOSTABLE'] = oos_scores if oos_scores_db: oos_db_id = self._client.get_database_id(oos_scores_db) train_args['OOSDB'] = {'database': oos_db_id} if sql_where: train_args['WHERESQL'] = sql_where if sql_limit: train_args['LIMITSQL'] = sql_limit if self.excluded_columns: train_args['EXCLUDE_COLS'] = ' '.join(self.excluded_columns) if fit_params: train_args['FIT_PARAMS'] = json.dumps(fit_params) if self.dependencies: train_args['DEPENDENCIES'] = ' '.join(self.dependencies) if _NEWEST_CIVISML_VERSION: if validation_data: train_args['VALIDATION_DATA'] = validation_data if n_jobs: train_args['N_JOBS'] = n_jobs if HAS_SKLEARN and isinstance(self.model, BaseEstimator): try: tempdir = tempfile.mkdtemp() fout = os.path.join(tempdir, 'estimator.pkl') joblib.dump(self.model, fout, compress=3) with open(fout, 'rb') as _fout: n = self.model_name if self.model_name else "CivisML" estimator_file_id = cio.file_to_civis( _fout, 'Estimator for ' + n, client=self._client) self._input_model = self.model # Keep the estimator self.model = str(estimator_file_id) finally: shutil.rmtree(tempdir) train_args['MODEL'] = self.model if HAS_SKLEARN and _NEWEST_CIVISML_VERSION: if isinstance(self.etl, BaseEstimator): try: tempdir = 
tempfile.mkdtemp() fout = os.path.join(tempdir, 'ETL.pkl') joblib.dump(self.etl, fout, compress=3) with open(fout, 'rb') as _fout: etl_file_id = cio.file_to_civis( _fout, 'ETL Estimator', client=self._client) train_args['ETL'] = str(etl_file_id) finally: shutil.rmtree(tempdir) name = self.model_name + ' Train' if self.model_name else None # Clear the existing training result so we can make a new one. self.train_result_ = None result, container, run = self._create_custom_run( self.train_template_id, job_name=name, table_name=table_name, database_name=database_name, file_id=file_id, args=train_args, resources=self.job_resources, polling_interval=polling_interval) self.train_result_ = result return result def _create_custom_run(self, template_id, job_name=None, table_name=None, database_name=None, file_id=None, args=None, resources=None, polling_interval=None): # Handle int-like but non-Python-integer types such as np.int64 file_id = int(file_id) if file_id is not None else file_id script_arguments = {'TABLE_NAME': table_name, 'CIVIS_FILE_ID': file_id, 'DEBUG': self.verbose} if database_name: if template_id < 8000: # v0 jobs used a different database parameter script_arguments['DB_NAME'] = database_name else: db_id = self._client.get_database_id(database_name) script_arguments['DB'] = {'database': db_id} resources = resources or {} for key, value in resources.items(): if value: # Default resources are set on the template. Only # modify via arguments if users give a non-default value. script_arguments[key] = value if self.git_token_name: creds = find(self._client.credentials.list(), name=self.git_token_name, type='Custom') if len(creds) > 1: raise ValueError("Unique credential with name '{}' for " "remote git hosting service not found!" 
.format(self.git_token_name)) script_arguments['GIT_CRED'] = creds[0].id script_arguments.update(args or {}) container = self._client.scripts.post_custom( from_template_id=template_id, name=job_name, arguments=script_arguments, notifications=self.notifications) log.info('Created custom script %s.', container.id) run = self._client.scripts.post_custom_runs(container.id) log.debug('Started job %s, run %s.', container.id, run.id) train_kwargs = {} if self.train_result_ is not None: train_kwargs = {'train_job_id': self.train_result_.job_id, 'train_run_id': self.train_result_.run_id} fut = ModelFuture( container.id, run.id, client=self._client, polling_interval=polling_interval, poll_on_creation=False, **train_kwargs) return fut, container, run @property @_check_fit_initiated def state(self): return self.train_result_.state @property @_check_fit_initiated def estimator(self): return self.train_result_.estimator @_check_fit_initiated def predict(self, df=None, csv_path=None, table_name=None, database_name=None, manifest=None, file_id=None, sql_where=None, sql_limit=None, primary_key=SENTINEL, output_table=None, output_db=None, if_exists='fail', n_jobs=None, polling_interval=None, cpu=None, memory=None, disk_space=None): """Make predictions on a trained model Provide input through one of a :class:`~pandas.DataFrame` (``df``), a local CSV (``csv_path``), a Civis Table (``table_name`` and ``database_name``), a Civis File containing a CSV (``file_id``), or a Civis File containing a manifest file (``manifest``). A "manifest file" is JSON which specifies the location of many shards of the data to be used for prediction. A manifest file is the output of a Civis export job with ``force_multifile=True`` set, e.g. from :func:`civis.io.civis_to_multifile_csv`. Large Civis Tables (provided using ``table_name``) will automatically be exported to manifest files. Prediction outputs will always be stored as gzipped CSVs in one or more Civis Files. 
You can find a list of File ID numbers for output files at the "output_file_ids" key in the metadata returned by the prediction job. Provide an ``output_table`` (and optionally an ``output_db``, if it's different from ``database_name``) to copy these predictions into a Civis Table. Parameters ---------- df : pd.DataFrame, optional A :class:`~pandas.DataFrame` of data for prediction. The :class:`~pandas.DataFrame` will be uploaded to a Civis file so that CivisML can access it. Note that the index of the :class:`~pandas.DataFrame` will be ignored -- use ``df.reset_index()`` if you want your index column to be included with the data passed to CivisML. csv_path : str, optional The location of a CSV of data on the local disk. It will be uploaded to a Civis file. table_name : str, optional The qualified name of the table containing your data database_name : str, optional Name of the database holding the data, e.g., 'My Redshift Cluster'. manifest : int, optional ID for a manifest file stored as a Civis file. (Note: if the manifest is not a Civis Platform-specific manifest, like the one returned from :func:`civis.io.civis_to_multfile_csv`, this must be used in conjunction with table_name and database_name due to the need for column discovery via Redshift.) file_id : int, optional If the data are a CSV stored in a Civis file, provide the integer file ID. sql_where : str, optional A SQL WHERE clause used to scope the rows to be predicted sql_limit : int, optional SQL LIMIT clause to restrict the size of the prediction set primary_key : str, optional Primary key of the prediction table. Defaults to the primary key of the training data. Use ``None`` to indicate that the prediction data don't have a primary key column. output_table: str, optional The table in which to put the predictions. output_db : str, optional Database of the output table. Defaults to the database of the input table. 
if_exists : {'fail', 'append', 'drop', 'truncate'} Action to take if the prediction table already exists. n_jobs : int, optional Number of concurrent Platform jobs to use for multi-file / large table prediction. polling_interval : float, optional Check for job completion every this number of seconds. Do not set if using the notifications endpoint. cpu : int, optional CPU shares requested by the user for a single job. memory : int, optional RAM requested by the user for a single job. disk_space : float, optional disk space requested by the user for a single job. Returns ------- :class:`~civis.ml.ModelFuture` """ self.train_result_.result() # Blocks and raises training errors if ((table_name is None or database_name is None) and file_id is None and df is None and csv_path is None and manifest is None): raise ValueError('Provide a source of data.') if sum((bool(table_name and database_name) or (manifest is not None), bool(file_id), df is not None, csv_path is not None)) > 1: raise ValueError('Provide a single source of data.') if df is not None: file_id = _stash_local_dataframe(df, client=self._client) elif csv_path: file_id = _stash_local_file(csv_path, client=self._client) if primary_key is SENTINEL: primary_key = self.primary_key predict_args = {'TRAIN_JOB': self.train_result_.job_id, 'TRAIN_RUN': self.train_result_.run_id, 'PRIMARY_KEY': primary_key, 'IF_EXISTS': if_exists} if output_table: predict_args['OUTPUT_TABLE'] = output_table if output_db: if self.predict_template_id == 7021: # v0 jobs used a different database parameter predict_args['OUTPUT_DB'] = output_db else: output_db_id = self._client.get_database_id(output_db) predict_args['OUTPUT_DB'] = {'database': output_db_id} if manifest: predict_args['MANIFEST'] = manifest if sql_where: predict_args['WHERESQL'] = sql_where if sql_limit: predict_args['LIMITSQL'] = sql_limit if n_jobs: predict_args['N_JOBS'] = n_jobs if _NEWEST_CIVISML_VERSION: if cpu: predict_args['CPU'] = cpu if memory: predict_args['MEMORY'] 
= memory if disk_space: predict_args['DISK_SPACE'] = disk_space name = self.model_name + ' Predict' if self.model_name else None result, container, run = self._create_custom_run( self.predict_template_id, job_name=name, table_name=table_name, database_name=database_name, file_id=file_id, args=predict_args, polling_interval=polling_interval) return result