def _stash_local_file(csv_path, client=None):
    """Upload a local CSV to a Civis File and hand back its file ID.

    The file at ``csv_path`` is opened in the default (text) mode and
    passed to ``cio.file_to_civis`` under the fixed name
    ``'modelpipeline_data.csv'``.

    Parameters
    ----------
    csv_path : str
        Location of the CSV on the local disk.
    client : optional
        Forwarded unchanged to ``cio.file_to_civis``.

    Returns
    -------
    The file ID returned by ``cio.file_to_civis``.
    """
    with open(csv_path) as local_csv:
        return cio.file_to_civis(local_csv, name='modelpipeline_data.csv',
                                 client=client)
def _store_and_attach_metadata(
    client: civis.APIClient,
    metadata: dict,
    metadata_paths: dict,
    filename: str,
) -> Tuple[int, dict]:
    """
    Persist DDL metadata as outputs of the currently running Civis script.

    Given an APIClient object, metadata read from DDL, a collection of keys
    and paths within the DDL metadata, and a filename, this function:
    (1) writes the raw metadata to a local JSON file and uploads it as a
        Civis File,
    (2) attaches that file to the current script run as an output,
    (3) cleans the metadata and writes each cleaned field to a JSONValue
        object attached to the current script run, and
    (4) returns the file ID of the raw metadata together with the cleaned
        metadata dictionary.

    Parameters
    ----------
    client: APIClient
        An instance of civis.APIClient. It is used for the file upload, for
        attaching the file as a run output, and for writing the JSONValues.
    metadata: dict
        The raw metadata read from DDL.
    metadata_paths: dict
        A dictionary of the paths used to clean the metadata read from DDL.
        This should be the value of ddl_metadata_paths in configs.constants.
    filename: str
        The name of the file to which raw metadata should be written. It is
        used both as the local path and as the name of the uploaded file.

    Returns
    -------
    Tuple[int, dict]:
        file_id (int) of the uploaded raw metadata, and cleaned_metadata
        (dict). Values whose key ends with "updated at" are returned as
        produced by ``_parse_metadata``; only the JSONValue outputs carry
        the formatted date string.

    Raises
    ------
    KeyError
        If the ``CIVIS_JOB_ID`` or ``CIVIS_RUN_ID`` environment variable is
        not set.

    Side Effects
    ------------
    - Writes ``metadata`` to a local file named ``filename`` (not removed)
    - Uploads that .json file (with ``expires_at=None``)
    - Attaches this .json file as a script output
    - Stores each cleaned metadata field as a JSONValues object
    - Attaches these JSONValues objects as script outputs
    """
    with open(filename, "w") as f:
        json.dump(metadata, f)

    # Upload with the caller's client so that every API call made by this
    # function goes through the same APIClient instance.
    file_id = file_to_civis(
        buf=filename,
        name=filename,
        expires_at=None,
        client=client,
    )
    client.scripts.post_containers_runs_outputs(
        id=os.environ["CIVIS_JOB_ID"],
        run_id=os.environ["CIVIS_RUN_ID"],
        object_type="File",
        object_id=file_id,
    )

    cleaned_metadata = _parse_metadata(metadata=metadata, paths=metadata_paths)
    for key, value in cleaned_metadata.items():
        if key.lower().endswith("updated at"):
            # Rebinding ``value`` only affects what is written to the
            # JSONValue; ``cleaned_metadata`` itself is left untouched.
            value = datetime.fromtimestamp(value).strftime("%Y-%m-%d %H:%M")
        write_and_attach_jsonvalue(json_value=value, name=key, client=client)

    return file_id, cleaned_metadata
def _stash_local_dataframe(df, client=None):
    """Write ``df`` as CSV into memory and upload it as a Civis File.

    The CSV is produced by ``df.to_csv`` with ``index=False`` and UTF-8
    encoding, then uploaded under the fixed name
    ``'modelpipeline_data.csv'``.

    Parameters
    ----------
    df
        Object exposing a ``to_csv`` method (a data frame).
    client : optional
        Forwarded unchanged to ``cio.file_to_civis``.

    Returns
    -------
    The file ID returned by ``cio.file_to_civis``.
    """
    raw = six.BytesIO()
    # On Python 3 the CSV is written through a UTF-8 text wrapper around the
    # byte buffer; otherwise it is written to the byte buffer directly.
    writer = io.TextIOWrapper(raw, encoding='utf-8') if six.PY3 else raw
    df.to_csv(writer, encoding='utf-8', index=False)
    # Push any text still held by the wrapper into the byte buffer, then
    # rewind so the upload starts from the first byte.
    writer.flush()
    raw.seek(0)
    return cio.file_to_civis(raw, name='modelpipeline_data.csv',
                             client=client)
def files_upload_cmd(path, name, expires_at):
    """Upload a local file to Civis and print the resulting File ID.

    Parameters
    ----------
    path : str
        Path of the local file; it is opened in binary mode.
    name : str or None
        Name for the uploaded file. ``None`` means the base name of
        ``path`` is used.
    expires_at : str or None
        ``None`` leaves the expiration unset so the Civis platform default
        (30 days) applies. The string ``"never"`` (any letter case) sends
        ``expires_at=None``. Any other value is passed through unchanged.
    """
    upload_name = os.path.basename(path) if name is None else name

    # An empty mapping means "do not send expires_at at all", which lets the
    # Civis platform apply its default (30 days).
    expiry = {}
    if expires_at is not None:
        never = expires_at.lower() == "never"
        expiry["expires_at"] = None if never else expires_at

    with open(path, 'rb') as stream:
        uploaded_id = file_to_civis(stream, name=upload_name, **expiry)
    print(uploaded_id)
def _stash_local_dataframe(df, client=None):
    """Write ``df`` as CSV into memory and upload it as a Civis File.

    Multi-indexed data frames are rejected up front. Otherwise the CSV is
    produced by ``df.to_csv`` with ``index=False`` and UTF-8 encoding, then
    uploaded under the fixed name ``'modelpipeline_data.csv'``.

    Parameters
    ----------
    df
        Object exposing a ``to_csv`` method (a data frame).
    client : optional
        Forwarded unchanged to ``cio.file_to_civis``.

    Returns
    -------
    The file ID returned by ``cio.file_to_civis``.

    Raises
    ------
    TypeError
        If ``df.index`` has a ``levels`` attribute that is not ``None``.
    """
    # Duck-typed multi-index detection: only multi-indexes carry a "levels"
    # attribute, so no pandas import is needed for this check.
    frame_index = getattr(df, "index", None)
    if getattr(frame_index, "levels", None) is not None:
        raise TypeError("CivisML does not support multi-indexed data frames. "
                        "Try calling `.reset_index` on your data to convert "
                        "it into a CivisML-friendly format.")

    raw = six.BytesIO()
    # On Python 3 the CSV is written through a UTF-8 text wrapper around the
    # byte buffer; otherwise it is written to the byte buffer directly.
    writer = io.TextIOWrapper(raw, encoding='utf-8') if six.PY3 else raw
    df.to_csv(writer, encoding='utf-8', index=False)
    # Push any text still held by the wrapper into the byte buffer, then
    # rewind so the upload starts from the first byte.
    writer.flush()
    raw.seek(0)
    return cio.file_to_civis(raw, name='modelpipeline_data.csv',
                             client=client)
def csv_to_civis(filename, database, table, api_key=None, client=None,
                 max_errors=None, existing_table_rows="fail",
                 diststyle=None, distkey=None,
                 sortkey1=None, sortkey2=None,
                 delimiter=",", headers=None,
                 credential_id=None, polling_interval=None,
                 archive=False, hidden=True):
    """Import a local CSV file into a Civis table.

    The file is uploaded as a Civis File named after the base name of
    ``filename`` and then handed to ``civis_file_to_table``, whose future
    is returned.

    Parameters
    ----------
    filename : str
        Path of the local file whose contents are uploaded.
    database : str or int
        Destination database, given either by name or by ID.
    table : str
        Destination schema and table, e.g. ``'scratch.table'``.
    api_key : DEPRECATED str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    client : :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.
    max_errors : int, optional
        Maximum number of erroneous rows to drop from the import before
        the import fails.
    existing_table_rows : str, optional
        What to do when the destination table already exists. One of
        ``'fail'``, ``'truncate'``, ``'append'`` or ``'drop'``.
        Defaults to ``'fail'``.
    diststyle : str, optional
        Distribution style of the table. One of ``'even'``, ``'all'`` or
        ``'key'``.
    distkey : str, optional
        Column used as the table's distkey.
    sortkey1 : str, optional
        Column used as the table's sortkey.
    sortkey2 : str, optional
        Second column of a compound sortkey.
    delimiter : string, optional
        Column delimiter. One of ``','``, ``'\\t'`` or ``'|'``.
    headers : bool, optional
        Whether the first row of the file holds column headers. The
        default, ``None``, attempts to autodetect this.
    credential_id : str or int, optional
        ID of the database credential. ``None`` selects the default
        credential.
    polling_interval : int or float, optional
        Seconds to wait between checks for job completion.
    archive : bool, optional (deprecated)
        Deprecated; a true value only triggers a ``FutureWarning`` here.
        Use ``hidden`` instead.
    hidden : bool, optional
        If ``True`` (the default), this job will not appear in the Civis UI.

    Returns
    -------
    results : :class:`~civis.futures.CivisFuture`
        A `CivisFuture` object.

    Notes
    -----
    This reads the contents of `filename` into memory.

    Examples
    --------
    >>> with open('input_file.csv', 'w') as _input:
    ...     _input.write('a,b,c\\n1,2,3')
    >>> fut = civis.io.csv_to_civis('input_file.csv',
    ...                             'my-database',
    ...                             'scratch.my_data')
    >>> fut.result()
    """
    if client is None:
        client = APIClient(api_key=api_key, resources='all')
    if archive:
        warnings.warn("`archive` is deprecated and will be removed in v2.0.0. "
                      "Use `hidden` instead.", FutureWarning)

    upload_name = path.basename(filename)
    with open(filename, "rb") as csv_stream:
        file_id = file_to_civis(csv_stream, upload_name, client=client)
    log.debug('Uploaded file %s to Civis file %s', filename, file_id)

    # Table-import settings forwarded verbatim to ``civis_file_to_table``.
    import_options = dict(
        max_errors=max_errors,
        existing_table_rows=existing_table_rows,
        diststyle=diststyle,
        distkey=distkey,
        sortkey1=sortkey1,
        sortkey2=sortkey2,
        delimiter=delimiter,
        headers=headers,
        credential_id=credential_id,
        polling_interval=polling_interval,
        hidden=hidden,
    )
    return civis_file_to_table(file_id, database, table, client=client,
                               **import_options)
def dataframe_to_civis(df, database, table, api_key=None, client=None,
                       max_errors=None, existing_table_rows="fail",
                       diststyle=None, distkey=None,
                       sortkey1=None, sortkey2=None,
                       headers=None, credential_id=None,
                       polling_interval=None,
                       archive=False, hidden=True, **kwargs):
    """Upload a `pandas` `DataFrame` into a Civis table.

    The `DataFrame` is written to a temporary CSV file, uploaded as a Civis
    File named after the last dot-separated component of ``table``, and then
    imported with ``civis_file_to_table`` using ``','`` as the delimiter.

    By default the `DataFrame`'s index will not be included. To store the
    index along with the other values, use `df.reset_index()` instead of
    `df` as the first argument to this function.

    Parameters
    ----------
    df : :class:`pandas:pandas.DataFrame`
        The `DataFrame` to upload to Civis.
    database : str or int
        Upload data into this database. Can be the database name or ID.
    table : str
        The schema and table you want to upload to. E.g.,
        ``'scratch.table'``.
    api_key : DEPRECATED str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    client : :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.
    max_errors : int, optional
        The maximum number of rows with errors to remove from the import
        before failing.
    existing_table_rows : str, optional
        The behaviour if a table with the requested name already exists.
        One of ``'fail'``, ``'truncate'``, ``'append'`` or ``'drop'``.
        Defaults to ``'fail'``.
    diststyle : str, optional
        The distribution style for the table.
        One of ``'even'``, ``'all'`` or ``'key'``.
    distkey : str, optional
        The column to use as the distkey for the table.
    sortkey1 : str, optional
        The column to use as the sortkey for the table.
    sortkey2 : str, optional
        The second column in a compound sortkey for the table.
    headers : bool, optional
        Whether or not the first row of the file should be treated as
        headers. The default, ``None``, attempts to autodetect whether
        or not the first row contains headers.
    credential_id : str or int, optional
        The ID of the database credential.  If ``None``, the default
        credential will be used.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for job completion.
    archive : bool, optional (deprecated)
        Deprecated; a true value only triggers a ``FutureWarning`` here.
        Use ``hidden`` instead.
    hidden : bool, optional
        If ``True`` (the default), this job will not appear in the Civis UI.
    **kwargs : kwargs
        Extra keyword arguments will be passed to
        :meth:`pandas:pandas.DataFrame.to_csv`. ``encoding='utf-8'`` and
        ``index=False`` are used unless overridden here.

    Returns
    -------
    fut : :class:`~civis.futures.CivisFuture`
        A `CivisFuture` object.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    >>> fut = civis.io.dataframe_to_civis(df, 'my-database',
    ...                                   'scratch.df_table')
    >>> fut.result()
    """
    if client is None:
        client = APIClient(api_key=api_key, resources='all')
    if archive:
        warnings.warn(
            "`archive` is deprecated and will be removed in v2.0.0. "
            "Use `hidden` instead.", FutureWarning)

    with TemporaryDirectory() as tmp_dir:
        tmp_path = os.path.join(tmp_dir, 'dataframe_to_civis.csv')
        # Merge the defaults with the caller's kwargs instead of passing both
        # explicitly: a caller-supplied ``encoding`` or ``index`` would
        # otherwise raise "got multiple values for keyword argument".
        to_csv_kwargs = {'encoding': 'utf-8', 'index': False}
        to_csv_kwargs.update(kwargs)
        df.to_csv(tmp_path, **to_csv_kwargs)
        name = table.split('.')[-1]
        file_id = file_to_civis(tmp_path, name, client=client)

    delimiter = ','
    fut = civis_file_to_table(file_id, database, table,
                              client=client, max_errors=max_errors,
                              existing_table_rows=existing_table_rows,
                              diststyle=diststyle, distkey=distkey,
                              sortkey1=sortkey1, sortkey2=sortkey2,
                              delimiter=delimiter, headers=headers,
                              credential_id=credential_id,
                              polling_interval=polling_interval,
                              hidden=hidden)

    return fut
def train(self, df=None, csv_path=None, table_name=None,
          database_name=None, file_id=None,
          sql_where=None, sql_limit=None, oos_scores=None,
          oos_scores_db=None, if_exists='fail', fit_params=None,
          polling_interval=None, validation_data='train', n_jobs=4):
    """Start a Civis Platform job to train your model

    Provide input through one of
    a :class:`~pandas.DataFrame` (``df``),
    a local CSV (``csv_path``),
    a Civis Table (``table_name`` and ``database_name``), or
    a Civis File containing a CSV (``file_id``).

    Model outputs will always contain out-of-sample scores
    (accessible through :attr:`ModelFuture.table` on this function's
    output), and you may choose to store these out-of-sample scores
    in a Civis Table with the ``oos_scores``, ``oos_scores_db``,
    and ``if_exists`` parameters.

    If ``self.model`` is a scikit-learn ``BaseEstimator``, it is pickled
    and uploaded to a Civis File; ``self.model`` is then replaced by
    the file ID (as a string) and the original estimator is kept in
    ``self._input_model``.

    Parameters
    ----------
    df : pd.DataFrame, optional
        A :class:`~pandas.DataFrame` of training data.
        The :class:`~pandas.DataFrame` will be uploaded to a Civis file
        so that CivisML can access it.
        Note that the index of the :class:`~pandas.DataFrame` will be
        ignored -- use ``df.reset_index()`` if you want your
        index column to be included with the data passed to CivisML.
    csv_path : str, optional
        The location of a CSV of data on the local disk.
        It will be uploaded to a Civis file.
    table_name : str, optional
        The qualified name of the table containing the training set from
        which to build the model.
    database_name : str, optional
        Name of the database holding the training set table used to
        build the model. E.g., 'My Cluster Name'.
    file_id : int, optional
        If the training data are stored in a Civis file,
        provide the integer file ID.
    sql_where : str, optional
        A SQL WHERE clause used to scope the rows of the training set
        (used for table input only)
    sql_limit : int, optional
        SQL LIMIT clause for querying the training set
        (used for table input only)
    oos_scores : str, optional
        If provided, store out-of-sample predictions on
        training set data to this Redshift "schema.tablename".
    oos_scores_db : str, optional
        If not provided, store OOS predictions in the same database
        which holds the training data.
    if_exists : {'fail', 'append', 'drop', 'truncate'}
        Action to take if the out-of-sample prediction table
        already exists.
    fit_params: Dict[str, str]
        Mapping from parameter names in the model's ``fit`` method
        to the column names which hold the data, e.g.
        ``{'sample_weight': 'survey_weight_column'}``.
    polling_interval : float, optional
        Check for job completion every this number of seconds.
        Do not set if using the notifications endpoint.
    validation_data : str, optional
        Source for validation data. There are currently two options:
        `'train'` (the default), which cross-validates over training
        data for validation; and `'skip'`, which skips the validation
        step.
    n_jobs : int, optional
        Number of jobs to use for training and validation. Defaults to
        4, which allows parallelization over the 4 cross validation
        folds. Increase n_jobs to parallelize over many hyperparameter
        combinations in grid search/hyperband, or decrease to use fewer
        computational resources at once.

    Returns
    -------
    :class:`~civis.ml.ModelFuture`

    Raises
    ------
    ValueError
        If no data source is given, or if more than one is given.
    """
    # Exactly one data source must be supplied; a table source only counts
    # when both the table name and the database name are present.
    if ((table_name is None or database_name is None) and
            file_id is None and df is None and csv_path is None):
        raise ValueError('Provide a source of data.')
    if sum((bool(table_name and database_name),
            bool(file_id), df is not None, csv_path is not None)) > 1:
        raise ValueError('Provide a single source of data.')

    # Local data is uploaded first so the job can read it by file ID.
    if df is not None:
        file_id = _stash_local_dataframe(df, client=self._client)
    elif csv_path:
        file_id = _stash_local_file(csv_path, client=self._client)

    train_args = {'TARGET_COLUMN': ' '.join(self.dependent_variable),
                  'PRIMARY_KEY': self.primary_key,
                  'PARAMS': json.dumps(self.parameters),
                  'CVPARAMS': json.dumps(self.cv_params),
                  'CALIBRATION': self.calibration,
                  'IF_EXISTS': if_exists}
    if oos_scores:
        train_args['OOSTABLE'] = oos_scores
    if oos_scores_db:
        oos_db_id = self._client.get_database_id(oos_scores_db)
        train_args['OOSDB'] = {'database': oos_db_id}
    if sql_where:
        train_args['WHERESQL'] = sql_where
    if sql_limit:
        train_args['LIMITSQL'] = sql_limit
    if self.excluded_columns:
        train_args['EXCLUDE_COLS'] = ' '.join(self.excluded_columns)
    if fit_params:
        train_args['FIT_PARAMS'] = json.dumps(fit_params)
    if self.dependencies:
        train_args['DEPENDENCIES'] = ' '.join(self.dependencies)
    if _NEWEST_CIVISML_VERSION:
        if validation_data:
            train_args['VALIDATION_DATA'] = validation_data
        if n_jobs:
            train_args['N_JOBS'] = n_jobs

    if HAS_SKLEARN and isinstance(self.model, BaseEstimator):
        # Create the directory before entering ``try`` so that ``finally``
        # never references an unassigned ``tempdir`` if creation fails.
        tempdir = tempfile.mkdtemp()
        try:
            fout = os.path.join(tempdir, 'estimator.pkl')
            joblib.dump(self.model, fout, compress=3)
            with open(fout, 'rb') as _fout:
                n = self.model_name if self.model_name else "CivisML"
                estimator_file_id = cio.file_to_civis(
                    _fout, 'Estimator for ' + n, client=self._client)
            self._input_model = self.model  # Keep the estimator
            self.model = str(estimator_file_id)
        finally:
            shutil.rmtree(tempdir)
    train_args['MODEL'] = self.model

    if HAS_SKLEARN and _NEWEST_CIVISML_VERSION:
        if isinstance(self.etl, BaseEstimator):
            # As above: create first, then guard only the work that needs
            # cleanup, so a failure here cannot touch a stale directory.
            tempdir = tempfile.mkdtemp()
            try:
                fout = os.path.join(tempdir, 'ETL.pkl')
                joblib.dump(self.etl, fout, compress=3)
                with open(fout, 'rb') as _fout:
                    etl_file_id = cio.file_to_civis(
                        _fout, 'ETL Estimator', client=self._client)
                train_args['ETL'] = str(etl_file_id)
            finally:
                shutil.rmtree(tempdir)

    name = self.model_name + ' Train' if self.model_name else None
    # Clear the existing training result so we can make a new one.
    self.train_result_ = None

    result, container, run = self._create_custom_run(
        self.train_template_id,
        job_name=name,
        table_name=table_name,
        database_name=database_name,
        file_id=file_id,
        args=train_args,
        resources=self.job_resources,
        polling_interval=polling_interval)

    self.train_result_ = result

    return result