Example #1
0
def _stash_local_file(csv_path, client=None):
    """Upload a local CSV to a Civis File and return the file ID.

    Parameters
    ----------
    csv_path : str
        Location of the CSV file on the local disk.
    client : optional
        Forwarded unchanged to ``cio.file_to_civis``.

    Returns
    -------
    The file ID returned by ``cio.file_to_civis``.
    """
    civis_fname = 'modelpipeline_data.csv'
    # Binary mode hands the upload the exact bytes on disk. Text mode would
    # decode them with the platform's default encoding and translate
    # newlines, which can fail on (or alter) non-ASCII data.
    with open(csv_path, 'rb') as _fin:
        file_id = cio.file_to_civis(_fin, name=civis_fname, client=client)

    return file_id
def _store_and_attach_metadata(client: civis.APIClient, metadata: dict,
                               metadata_paths: dict,
                               filename: str) -> Tuple[int, dict]:
    """
    Given an APIClient object, metadata read from DDL, a collection of keys
    and paths within the DDL metadata, and a filename, this function:
        (1) writes the raw metadata fields to a local JSON file and uploads
        it as a file object,
        (2) attaches that file to the current script run as an output,
        (3) writes each cleaned metadata field to a JSONValue object and
        attaches it to the current script run, and
        (4) returns the file_id of the raw metadata and the cleaned metadata
        as a dictionary.

    Parameters
    ----------
    client: APIClient
        An instance of civis.APIClient. It is used for the file upload,
        for attaching the run output and for writing the JSONValues.
    metadata: dict
        The raw metadata read from DDL.
    metadata_paths: dict
        A dictionary of the paths used to clean the metadata read from DDL.
        This should be the value of ddl_metadata_paths in configs.constants.
    filename: str
        The name of the file to which raw metadata should be written. The
        same string is used as the local path and as the uploaded file's
        name.

    Returns
    -------
    Tuple[int, dict]:
        file_id (int) of the raw metadata stored in S3, and
        cleaned_metadata (dict)

    Raises
    ------
    KeyError
        If the CIVIS_JOB_ID or CIVIS_RUN_ID environment variable is not set.

    Side Effects
    ------------
    - Writes the object passed to the metadata argument to ``filename`` on
      the local disk
    - Stores that .json file in S3
    - Attaches this .json file as a script output
    - Stores cleaned metadata object as a JSONValues object
    - Attaches this JSONValues object as a script output
    """

    with open(filename, "w") as f:
        json.dump(metadata, f)
    # Upload through the caller's client so the file is created by the same
    # client that attaches it to the run below (previously the client
    # argument was ignored for this call).
    # NOTE(review): expires_at=None presumably means "never expires";
    # confirm against the file_to_civis documentation.
    file_id = file_to_civis(
        buf=filename,
        name=filename,
        expires_at=None,
        client=client,
    )
    client.scripts.post_containers_runs_outputs(
        id=os.environ["CIVIS_JOB_ID"],
        run_id=os.environ["CIVIS_RUN_ID"],
        object_type="File",
        object_id=file_id,
    )

    cleaned_metadata = _parse_metadata(metadata=metadata, paths=metadata_paths)
    for key, value in cleaned_metadata.items():
        if key.lower().endswith("updated at"):
            # Only the JSONValue output gets the formatted timestamp; the
            # returned dictionary keeps the original value.
            value = datetime.fromtimestamp(value).strftime("%Y-%m-%d %H:%M")
        write_and_attach_jsonvalue(json_value=value, name=key, client=client)
    return file_id, cleaned_metadata
Example #3
0
def _stash_local_dataframe(df, client=None):
    """Write ``df`` as CSV into memory, upload it, and return the file ID.

    The data frame is serialized with ``to_csv`` (UTF-8, index omitted)
    into an in-memory byte buffer, which is then passed to
    ``cio.file_to_civis`` together with ``client``.
    """
    raw = six.BytesIO()
    # On Python 3 the CSV goes through a UTF-8 text layer over the byte
    # buffer; otherwise the byte buffer is written to directly.
    sink = io.TextIOWrapper(raw, encoding='utf-8') if six.PY3 else raw
    df.to_csv(sink, encoding='utf-8', index=False)
    sink.flush()  # push anything still held by the text layer into ``raw``
    raw.seek(0)  # rewind so the upload reads from the first byte

    uploaded_id = cio.file_to_civis(raw, name='modelpipeline_data.csv',
                                    client=client)
    return uploaded_id
Example #4
0
def files_upload_cmd(path, name, expires_at):
    """Upload the local file at ``path`` to Civis and print its File ID.

    ``name`` defaults to the base name of ``path`` when it is ``None``.
    ``expires_at`` controls the expiry that is requested:

    * ``None`` -- nothing is sent, so the Civis platform default
      (30 days) applies;
    * ``"never"`` (any letter case) -- ``expires_at=None`` is sent;
    * any other string -- sent through unchanged.
    """
    upload_name = os.path.basename(path) if name is None else name

    upload_kwargs = {}
    if expires_at is not None:
        never_expires = expires_at.lower() == "never"
        upload_kwargs["expires_at"] = None if never_expires else expires_at

    with open(path, 'rb') as local_file:
        new_file_id = file_to_civis(local_file, name=upload_name,
                                    **upload_kwargs)
    print(new_file_id)
Example #5
0
def _stash_local_dataframe(df, client=None):
    """Write ``df`` as CSV into memory, upload it, and return the file ID.

    Raises ``TypeError`` when the index of ``df`` exposes a non-``None``
    ``levels`` attribute (i.e. it looks like a multi-index).  Otherwise
    the data frame is serialized with ``to_csv`` (UTF-8, index omitted)
    into an in-memory byte buffer that is handed to
    ``cio.file_to_civis``.
    """
    # Duck-type the multi-index check: only multi-indexes carry a "levels"
    # attribute, so pandas itself never has to be imported for this test.
    index = getattr(df, "index", None)
    if getattr(index, "levels", None) is not None:
        raise TypeError("CivisML does not support multi-indexed data frames. "
                        "Try calling `.reset_index` on your data to convert "
                        "it into a CivisML-friendly format.")

    raw = six.BytesIO()
    # On Python 3 the CSV goes through a UTF-8 text layer over the byte
    # buffer; otherwise the byte buffer is written to directly.
    sink = io.TextIOWrapper(raw, encoding='utf-8') if six.PY3 else raw
    df.to_csv(sink, encoding='utf-8', index=False)
    sink.flush()  # push anything still held by the text layer into ``raw``
    raw.seek(0)  # rewind so the upload reads from the first byte

    uploaded_id = cio.file_to_civis(raw, name='modelpipeline_data.csv',
                                    client=client)
    return uploaded_id
Example #6
0
def csv_to_civis(filename,
                 database,
                 table,
                 api_key=None,
                 client=None,
                 max_errors=None,
                 existing_table_rows="fail",
                 diststyle=None,
                 distkey=None,
                 sortkey1=None,
                 sortkey2=None,
                 delimiter=",",
                 headers=None,
                 credential_id=None,
                 polling_interval=None,
                 archive=False,
                 hidden=True):
    """Upload a local CSV file to Civis and import it into a table.

    The file is first uploaded as a Civis file (named after the base name
    of `filename`), and that Civis file is then imported into `table`
    through ``civis_file_to_table``.

    Parameters
    ----------
    filename : str
        Path of the local file whose contents are uploaded.
    database : str or int
        Destination database, given as a name or an ID.
    table : str
        Destination schema and table, e.g. ``'scratch.table'``.
    api_key : DEPRECATED str, optional
        Your Civis API key. Only used to build a client when `client` is
        not given. If neither is given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    client : :class:`civis.APIClient`, optional
        Client used for the upload and the import. When omitted, a new
        :class:`civis.APIClient` is created.
    max_errors : int, optional
        The maximum number of rows with errors to remove from the import
        before failing.
    existing_table_rows : str, optional
        What to do when the destination table already exists: one of
        ``'fail'`` (the default), ``'truncate'``, ``'append'`` or
        ``'drop'``.
    diststyle : str, optional
        Distribution style of the table: ``'even'``, ``'all'`` or
        ``'key'``.
    distkey : str, optional
        Column to use as the table's distkey.
    sortkey1 : str, optional
        Column to use as the table's sortkey.
    sortkey2 : str, optional
        Second column of a compound sortkey.
    delimiter : string, optional
        Column delimiter of the file. One of ``','``, ``'\\t'`` or
        ``'|'``.
    headers : bool, optional
        Whether the first row holds column headers. ``None`` (the
        default) asks for this to be autodetected.
    credential_id : str or int, optional
        ID of the database credential. ``None`` selects the default
        credential.
    polling_interval : int or float, optional
        Seconds to wait between checks for job completion.
    archive : bool, optional (deprecated)
        Deprecated in favour of `hidden`. A true value only emits a
        ``FutureWarning`` here; it is not forwarded to the import.
    hidden : bool, optional
        If ``True`` (the default), this job will not appear in the Civis UI.

    Returns
    -------
    results : :class:`~civis.futures.CivisFuture`
        The future returned by ``civis_file_to_table``.

    Notes
    -----
    The local file stays open until the import call has returned.
    NOTE(review): earlier documentation stated that the contents of
    `filename` are read into memory; that depends on ``file_to_civis``
    and is not visible from this function.

    Examples
    --------
    >>> with open('input_file.csv', 'w') as _input:
    ...     _input.write('a,b,c\\n1,2,3')
    >>> fut = civis.io.csv_to_civis('input_file.csv',
    ...                             'my-database',
    ...                             'scratch.my_data')
    >>> fut.result()
    """
    if client is None:
        client = APIClient(api_key=api_key, resources='all')
    if archive:
        deprecation_msg = ("`archive` is deprecated and will be removed in "
                           "v2.0.0. Use `hidden` instead.")
        warnings.warn(deprecation_msg, FutureWarning)

    # Everything the import needs besides the file ID and its destination.
    import_options = dict(
        client=client,
        max_errors=max_errors,
        existing_table_rows=existing_table_rows,
        diststyle=diststyle,
        distkey=distkey,
        sortkey1=sortkey1,
        sortkey2=sortkey2,
        delimiter=delimiter,
        headers=headers,
        credential_id=credential_id,
        polling_interval=polling_interval,
        hidden=hidden,
    )

    upload_name = path.basename(filename)
    with open(filename, "rb") as csv_file:
        uploaded_id = file_to_civis(csv_file, upload_name, client=client)
        log.debug('Uploaded file %s to Civis file %s', filename, uploaded_id)
        import_future = civis_file_to_table(uploaded_id, database, table,
                                            **import_options)
    return import_future
Example #7
0
def dataframe_to_civis(df,
                       database,
                       table,
                       api_key=None,
                       client=None,
                       max_errors=None,
                       existing_table_rows="fail",
                       diststyle=None,
                       distkey=None,
                       sortkey1=None,
                       sortkey2=None,
                       headers=None,
                       credential_id=None,
                       polling_interval=None,
                       archive=False,
                       hidden=True,
                       **kwargs):
    """Upload a `pandas` `DataFrame` into a Civis table.

    The `DataFrame` is written to a temporary CSV file, uploaded as a
    Civis file and then imported into `table`.

    By default the `DataFrame`'s index will not be included. To store the
    index along with the other values, use `df.reset_index()` instead
    of `df` as the first argument to this function.

    Parameters
    ----------
    df : :class:`pandas:pandas.DataFrame`
        The `DataFrame` to upload to Civis.
    database : str or int
        Upload data into this database. Can be the database name or ID.
    table : str
        The schema and table you want to upload to. E.g.,
        ``'scratch.table'``. The part after the last ``'.'`` is used as
        the name of the uploaded Civis file.
    api_key : DEPRECATED str, optional
        Your Civis API key. Only used to build a client when `client` is
        not given. If neither is given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    client : :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.
    max_errors : int, optional
        The maximum number of rows with errors to remove from the import
        before failing.
    existing_table_rows : str, optional
        The behaviour if a table with the requested name already exists.
        One of ``'fail'``, ``'truncate'``, ``'append'`` or ``'drop'``.
        Defaults to ``'fail'``.
    diststyle : str, optional
        The distribution style for the table.
        One of ``'even'``, ``'all'`` or ``'key'``.
    distkey : str, optional
        The column to use as the distkey for the table.
    sortkey1 : str, optional
        The column to use as the sortkey for the table.
    sortkey2 : str, optional
        The second column in a compound sortkey for the table.
    headers : bool, optional
        Whether or not the first row of the file should be treated as
        headers. The default, ``None``, attempts to autodetect whether
        or not the first row contains headers.
    credential_id : str or int, optional
        The ID of the database credential.  If ``None``, the default
        credential will be used.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for job completion.
    archive : bool, optional (deprecated)
        Deprecated in favour of `hidden`. A true value only emits a
        ``FutureWarning`` here; it is not forwarded to the import.
    hidden : bool, optional
        If ``True`` (the default), this job will not appear in the Civis UI.
    **kwargs : kwargs
        Extra keyword arguments will be passed to
        :meth:`pandas:pandas.DataFrame.to_csv`. They take precedence over
        this function's defaults (``encoding='utf-8'``, ``index=False``).
        If ``sep`` is given, the same value is passed to the import as
        the column delimiter.

    Returns
    -------
    fut : :class:`~civis.futures.CivisFuture`
        A `CivisFuture` object.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    >>> fut = civis.io.dataframe_to_civis(df, 'my-database',
    ...                                   'scratch.df_table')
    >>> fut.result()
    """
    if client is None:
        client = APIClient(api_key=api_key, resources='all')
    if archive:
        warnings.warn(
            "`archive` is deprecated and will be removed in v2.0.0. "
            "Use `hidden` instead.", FutureWarning)

    # Start from this function's defaults and let caller-supplied keywords
    # win. Passing both side by side to ``to_csv`` would raise a TypeError
    # ("multiple values for keyword argument") for ``encoding``/``index``.
    to_csv_kwargs = {'encoding': 'utf-8', 'index': False}
    to_csv_kwargs.update(kwargs)
    # The import must be told the delimiter that is actually written to the
    # file, so follow a caller-supplied ``sep`` instead of assuming a comma.
    delimiter = to_csv_kwargs.get('sep', ',')

    with TemporaryDirectory() as tmp_dir:
        tmp_path = os.path.join(tmp_dir, 'dataframe_to_civis.csv')
        df.to_csv(tmp_path, **to_csv_kwargs)
        name = table.split('.')[-1]
        # The upload has to finish inside the ``with`` block: the temporary
        # directory (and the CSV in it) is deleted on exit.
        file_id = file_to_civis(tmp_path, name, client=client)

    fut = civis_file_to_table(file_id,
                              database,
                              table,
                              client=client,
                              max_errors=max_errors,
                              existing_table_rows=existing_table_rows,
                              diststyle=diststyle,
                              distkey=distkey,
                              sortkey1=sortkey1,
                              sortkey2=sortkey2,
                              delimiter=delimiter,
                              headers=headers,
                              credential_id=credential_id,
                              polling_interval=polling_interval,
                              hidden=hidden)

    return fut
Example #8
0
    def train(self, df=None, csv_path=None, table_name=None,
              database_name=None, file_id=None,
              sql_where=None, sql_limit=None, oos_scores=None,
              oos_scores_db=None, if_exists='fail', fit_params=None,
              polling_interval=None, validation_data='train', n_jobs=4):
        """Start a Civis Platform job to train your model

        Provide input through one of
        a :class:`~pandas.DataFrame` (``df``),
        a local CSV (``csv_path``),
        a Civis Table (``table_name`` and ``database_name``), or
        a Civis File containing a CSV (``file_id``).

        Model outputs will always contain out-of-sample scores
        (accessible through :attr:`ModelFuture.table` on this function's
        output), and you may choose to store these out-of-sample scores
        in a Civis Table with the ``oos_scores``, ``oos_scores_db``,
        and ``if_exists`` parameters.

        Parameters
        ----------
        df : pd.DataFrame, optional
            A :class:`~pandas.DataFrame` of training data.
            The :class:`~pandas.DataFrame` will be uploaded to a Civis file so
            that CivisML can access it.
            Note that the index of the :class:`~pandas.DataFrame` will be
            ignored -- use ``df.reset_index()`` if you want your
            index column to be included with the data passed to CivisML.
        csv_path : str, optional
            The location of a CSV of data on the local disk.
            It will be uploaded to a Civis file.
        table_name : str, optional
            The qualified name of the table containing the training set from
            which to build the model.
        database_name : str, optional
            Name of the database holding the training set table used to
            build the model. E.g., 'My Cluster Name'.
        file_id : int, optional
            If the training data are stored in a Civis file,
            provide the integer file ID.
        sql_where : str, optional
            A SQL WHERE clause used to scope the rows of the training set
            (used for table input only)
        sql_limit : int, optional
            SQL LIMIT clause for querying the training set
            (used for table input only)
        oos_scores : str, optional
            If provided, store out-of-sample predictions on
            training set data to this Redshift "schema.tablename".
        oos_scores_db : str, optional
            If not provided, store OOS predictions in the same database
            which holds the training data.
        if_exists : {'fail', 'append', 'drop', 'truncate'}
            Action to take if the out-of-sample prediction table
            already exists.
        fit_params: Dict[str, str]
            Mapping from parameter names in the model's ``fit`` method
            to the column names which hold the data, e.g.
            ``{'sample_weight': 'survey_weight_column'}``.
        polling_interval : float, optional
            Check for job completion every this number of seconds.
            Do not set if using the notifications endpoint.
        validation_data : str, optional
            Source for validation data. There are currently two options:
            `'train'` (the default), which cross-validates over training data
            for validation; and `'skip'`, which skips the validation step.
        n_jobs : int, optional
            Number of jobs to use for training and validation. Defaults to
            4, which allows parallelization over the 4 cross validation folds.
            Increase n_jobs to parallelize over many hyperparameter
            combinations in grid search/hyperband, or decrease to use fewer
            computational resources at once.

        Returns
        -------
        :class:`~civis.ml.ModelFuture`

        Raises
        ------
        ValueError
            If no source of data is provided, or if more than one is.

        Notes
        -----
        When ``self.model`` is a scikit-learn estimator it is pickled and
        uploaded to a Civis file; ``self.model`` is then replaced by that
        file's ID (as a string) and the estimator itself is kept on
        ``self._input_model``. ``self.train_result_`` is reset before the
        run is created and set to the new result afterwards.
        """
        if ((table_name is None or database_name is None) and
                file_id is None and df is None and csv_path is None):
            raise ValueError('Provide a source of data.')
        if sum((bool(table_name and database_name),
                bool(file_id), df is not None, csv_path is not None)) > 1:
            raise ValueError('Provide a single source of data.')

        if df is not None:
            file_id = _stash_local_dataframe(df, client=self._client)
        elif csv_path is not None:
            # Same ``is not None`` test as the validation above, so that an
            # empty path fails loudly when opened instead of silently
            # starting a run without any data.
            file_id = _stash_local_file(csv_path, client=self._client)

        train_args = {'TARGET_COLUMN': ' '.join(self.dependent_variable),
                      'PRIMARY_KEY': self.primary_key,
                      'PARAMS': json.dumps(self.parameters),
                      'CVPARAMS': json.dumps(self.cv_params),
                      'CALIBRATION': self.calibration,
                      'IF_EXISTS': if_exists}
        # Optional arguments are only sent when they carry a truthy value.
        if oos_scores:
            train_args['OOSTABLE'] = oos_scores
        if oos_scores_db:
            oos_db_id = self._client.get_database_id(oos_scores_db)
            train_args['OOSDB'] = {'database': oos_db_id}
        if sql_where:
            train_args['WHERESQL'] = sql_where
        if sql_limit:
            train_args['LIMITSQL'] = sql_limit
        if self.excluded_columns:
            train_args['EXCLUDE_COLS'] = ' '.join(self.excluded_columns)
        if fit_params:
            train_args['FIT_PARAMS'] = json.dumps(fit_params)
        if self.dependencies:
            train_args['DEPENDENCIES'] = ' '.join(self.dependencies)
        # These two arguments are only sent when the module-level
        # ``_NEWEST_CIVISML_VERSION`` flag is set.
        if _NEWEST_CIVISML_VERSION:
            if validation_data:
                train_args['VALIDATION_DATA'] = validation_data
            if n_jobs:
                train_args['N_JOBS'] = n_jobs

        if HAS_SKLEARN and isinstance(self.model, BaseEstimator):
            # Create the directory before entering ``try``: if ``mkdtemp``
            # itself fails there is nothing to clean up, and the ``finally``
            # clause must not run against an unbound name.
            tempdir = tempfile.mkdtemp()
            try:
                fout = os.path.join(tempdir, 'estimator.pkl')
                joblib.dump(self.model, fout, compress=3)
                n = self.model_name if self.model_name else "CivisML"
                with open(fout, 'rb') as _fout:
                    estimator_file_id = cio.file_to_civis(
                        _fout, 'Estimator for ' + n, client=self._client)
                self._input_model = self.model  # Keep the estimator
                self.model = str(estimator_file_id)
            finally:
                shutil.rmtree(tempdir)
        train_args['MODEL'] = self.model

        if HAS_SKLEARN and _NEWEST_CIVISML_VERSION:
            if isinstance(self.etl, BaseEstimator):
                # As above: allocate outside ``try`` so a failure here can
                # never make ``finally`` remove a stale directory name left
                # over from the estimator upload.
                tempdir = tempfile.mkdtemp()
                try:
                    fout = os.path.join(tempdir, 'ETL.pkl')
                    joblib.dump(self.etl, fout, compress=3)
                    with open(fout, 'rb') as _fout:
                        etl_file_id = cio.file_to_civis(
                            _fout, 'ETL Estimator', client=self._client)
                    train_args['ETL'] = str(etl_file_id)
                finally:
                    shutil.rmtree(tempdir)

        name = self.model_name + ' Train' if self.model_name else None
        # Clear the existing training result so we can make a new one.
        self.train_result_ = None

        result, container, run = self._create_custom_run(
            self.train_template_id,
            job_name=name,
            table_name=table_name,
            database_name=database_name,
            file_id=file_id,
            args=train_args,
            resources=self.job_resources,
            polling_interval=polling_interval)

        self.train_result_ = result

        return result