Example #1
0
def query_civis(sql,
                database,
                api_key=None,
                client=None,
                credential_id=None,
                preview_rows=10,
                polling_interval=None,
                hidden=True):
    """Run a SQL statement in Civis Platform as a query job.

    Intended for statements that return nothing or need only a small
    preview of results. For queries that return many rows, use
    :func:`~civis.io.read_civis_sql` instead.

    Parameters
    ----------
    sql : str
        The SQL statement to execute.
    database : str or int
        The name or ID of the database to run against.
    api_key : DEPRECATED str, optional
        Your Civis API key. Defaults to the :envvar:`CIVIS_API_KEY`
        environment variable when omitted.
    client : :class:`civis.APIClient`, optional
        An existing API client to use. If omitted, one is created from
        the :envvar:`CIVIS_API_KEY` environment variable.
    credential_id : str or int, optional
        The database credential ID. Falls back to the default
        credential when ``None``.
    preview_rows : int, optional
        Maximum number of rows to return in the preview; no more than
        100 rows can be returned at once.
    polling_interval : int or float, optional
        Seconds to wait between checks for query completion.
    hidden : bool, optional
        When ``True`` (the default), the job does not appear in the
        Civis UI.

    Returns
    -------
    results : :class:`~civis.futures.CivisFuture`
        A `CivisFuture` object.

    Examples
    --------
    >>> run = query_civis(sql="DELETE schema.table", database='database')
    >>> run.result()  # Wait for query to complete
    """
    if client is None:
        client = APIClient(api_key=api_key)
    # Resolve database name/ID and credential, then submit the query.
    response = client.queries.post(
        client.get_database_id(database),
        sql,
        preview_rows,
        credential=credential_id or client.default_credential,
        hidden=hidden)
    return CivisFuture(client.queries.get,
                       (response.id,),
                       polling_interval,
                       client=client,
                       poll_on_creation=False)
Example #2
0
def query_civis(sql,
                database,
                api_key=None,
                credential_id=None,
                preview_rows=10,
                polling_interval=_DEFAULT_POLLING_INTERVAL):
    """Run a SQL statement in Civis Platform as a query job.

    Intended for statements that return nothing or need only a small
    preview of results. For queries that return many rows, use
    :func:`~civis.io.read_civis_sql` instead.

    Parameters
    ----------
    sql : str
        The SQL statement to execute.
    database : str or int
        The name or ID of the database to run against.
    api_key : str, optional
        Your Civis API key. Defaults to the :envvar:`CIVIS_API_KEY`
        environment variable when omitted.
    credential_id : str or int, optional
        The database credential ID. Falls back to the default
        credential when ``None``.
    preview_rows : int, optional
        Maximum number of rows to return in the preview; no more than
        100 rows can be returned at once.
    polling_interval : int or float, optional
        Seconds to wait between checks for query completion.

    Returns
    -------
    results : :class:`~civis.polling.PollableResult`
        A `PollableResult` object.

    Examples
    --------
    >>> run = query_civis(sql="DELETE schema.table", database='database')
    >>> run.result()  # Wait for query to complete
    """
    client = APIClient(api_key=api_key)
    # Resolve database name/ID and credential, then submit the query.
    response = client.queries.post(
        client.get_database_id(database),
        sql,
        preview_rows,
        credential=credential_id or client.default_credential)
    return PollableResult(client.queries.get,
                          (response.id,),
                          polling_interval)
Example #3
0
def _import_bytes(buf, database, table, api_key, max_errors,
                  existing_table_rows, distkey, sortkey1, sortkey2, delimiter,
                  headers, credential_id, polling_interval, archive, hidden):
    """Upload the contents of a file-like buffer of CSV bytes to a table.

    Creates a Civis file import job, PUTs the raw bytes from ``buf`` to
    the job's upload URI, then starts the import run. Parameters mirror
    the public CSV-upload helpers; ``table`` must be a fully qualified
    ``'schema.table'`` name.

    Returns
    -------
    :class:`~civis.futures.CivisFuture`
        A future tracking the import run.

    Raises
    ------
    ValueError
        If `delimiter` is not one of the supported delimiters.
    """
    client = APIClient(api_key=api_key)
    schema, table = table.split(".", 1)
    db_id = client.get_database_id(database)
    cred_id = credential_id or client.default_credential
    delimiter = DELIMITERS.get(delimiter)
    # Raise an explicit exception instead of using ``assert`` so the
    # validation still runs under ``python -O`` (asserts are stripped).
    # This matches the delimiter check in ``civis_to_csv``.
    if not delimiter:
        raise ValueError("delimiter must be one of {}".format(
            DELIMITERS.keys()))

    kwargs = dict(schema=schema,
                  name=table,
                  remote_host_id=db_id,
                  credential_id=cred_id,
                  max_errors=max_errors,
                  existing_table_rows=existing_table_rows,
                  distkey=distkey,
                  sortkey1=sortkey1,
                  sortkey2=sortkey2,
                  column_delimiter=delimiter,
                  first_row_is_header=headers,
                  hidden=hidden)

    import_job = client.imports.post_files(**kwargs)
    # Upload the raw bytes, then start the import run via the API session.
    put_response = requests.put(import_job.upload_uri, buf)
    put_response.raise_for_status()
    run_job_result = client._session.post(import_job.run_uri)
    run_job_result.raise_for_status()
    run_info = run_job_result.json()
    fut = CivisFuture(client.imports.get_files_runs,
                      (run_info['importId'], run_info['id']),
                      polling_interval=polling_interval,
                      api_key=api_key,
                      poll_on_creation=False)
    if archive:

        def f(x):
            # Archive the import job once the run completes.
            return client.imports.put_archive(import_job.id, True)

        fut.add_done_callback(f)
    return fut
Example #4
0
def transfer_table(source_db,
                   dest_db,
                   source_table,
                   dest_table,
                   job_name=None,
                   api_key=None,
                   source_credential_id=None,
                   dest_credential_id=None,
                   polling_interval=_DEFAULT_POLLING_INTERVAL,
                   **advanced_options):
    """Copy a table from one Civis database to another.

    Parameters
    ----------
    source_db : str or int
        Name or ID of the database containing the source table.
    dest_db : str or int
        Name or ID of the database receiving the table.
    source_table : str
        Fully qualified name of the table to transfer, e.g.,
        ``'schema.table'``.
    dest_table : str
        Fully qualified name of the table in the destination database,
        e.g., ``'schema.table'``.
    job_name : str, optional
        Name for the sync job; a random name is generated when omitted.
    api_key : str, optional
        Your Civis API key. Defaults to the :envvar:`CIVIS_API_KEY`
        environment variable when omitted.
    source_credential_id : str or int, optional
        Credential ID for the source database; the default credential
        is used when ``None``.
    dest_credential_id : str or int, optional
        Credential ID for the destination database; the default
        credential is used when ``None``.
    polling_interval : int or float, optional
        Seconds to wait between checks for job completion.
    **advanced_options : kwargs
        Additional keyword arguments forwarded to the import sync job.
        See :func:`~civis.resources._resources.Imports.post_syncs`.

    Returns
    -------
    results : :class:`~civis.polling.PollableResult`
        A `PollableResult` object.

    Examples
    --------
    >>> transfer_table(source_db='Cluster A', dest_db='Cluster B',
    ...                source_table='schma.tbl', dest_table='schma.tbl')
    """
    client = APIClient(api_key=api_key)
    job_name = maybe_get_random_name(job_name)
    source = {
        'remote_host_id': client.get_database_id(source_db),
        'credential_id': source_credential_id or client.default_credential,
    }
    destination = {
        'remote_host_id': client.get_database_id(dest_db),
        'credential_id': dest_credential_id or client.default_credential,
    }
    # Create the Dbsync import job, attach the table sync, then start a run.
    import_id = client.imports.post(job_name,
                                    "Dbsync",
                                    True,
                                    source=source,
                                    destination=destination).id
    client.imports.post_syncs(id=import_id,
                              source={'path': source_table},
                              destination={'path': dest_table},
                              advanced_options=advanced_options)
    run_id = client.imports.post_runs(id=import_id).run_id

    return PollableResult(client.imports.get_files_runs,
                          (import_id, run_id),
                          polling_interval)
Example #5
0
def civis_file_to_table(file_id,
                        database,
                        table,
                        client=None,
                        max_errors=None,
                        existing_table_rows="fail",
                        diststyle=None,
                        distkey=None,
                        sortkey1=None,
                        sortkey2=None,
                        delimiter=",",
                        headers=None,
                        credential_id=None,
                        polling_interval=None,
                        hidden=True):
    """Upload the contents of a Civis file to a Civis table.

    Parameters
    ----------
    file_id : int
        Civis file ID.
    database : str or int
        Upload data into this database. Can be the database name or ID.
    table : str
        The schema and table you want to upload to. E.g.,
        ``'scratch.table'``.
    client : :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.
    max_errors : int, optional
        The maximum number of rows with errors to remove from the import
        before failing.
    existing_table_rows : str, optional
        The behaviour if a table with the requested name already exists.
        One of ``'fail'``, ``'truncate'``, ``'append'`` or ``'drop'``.
        Defaults to ``'fail'``.
    diststyle : str, optional
        The distribution style for the table.
        One of ``'even'``, ``'all'`` or ``'key'``.
    distkey : str, optional
        The column to use as the distkey for the table.
    sortkey1 : str, optional
        The column to use as the sortkey for the table.
    sortkey2 : str, optional
        The second column in a compound sortkey for the table.
    delimiter : string, optional
        The column delimiter. One of ``','``, ``'\\t'`` or ``'|'``.
    headers : bool, optional
        Whether or not the first row of the file should be treated as
        headers. The default, ``None``, attempts to autodetect whether
        or not the first row contains headers.
    credential_id : str or int, optional
        The ID of the database credential.  If ``None``, the default
        credential will be used.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for job completion.
    hidden : bool, optional
        If ``True`` (the default), this job will not appear in the Civis UI.

    Returns
    -------
    results : :class:`~civis.futures.CivisFuture`
        A `CivisFuture` object.

    Raises
    ------
    ValueError
        If `delimiter` is not one of ``','``, ``'\\t'`` or ``'|'``.

    Examples
    --------
    >>> file_id = 100
    >>> fut = civis.io.civis_file_to_table(file_id,
    ...                                    'my-database',
    ...                                    'scratch.my_data')
    >>> fut.result()
    """
    if client is None:
        client = APIClient(resources='all')

    schema, table = table.split(".", 1)
    db_id = client.get_database_id(database)
    cred_id = credential_id or client.default_credential
    delimiter = DELIMITERS.get(delimiter)
    # Raise an explicit exception instead of using ``assert`` so the
    # validation still runs under ``python -O`` (asserts are stripped).
    # This matches the delimiter check in ``civis_to_csv``.
    if not delimiter:
        raise ValueError("delimiter must be one of {}".format(
            DELIMITERS.keys()))

    destination = dict(remote_host_id=db_id, credential_id=cred_id)
    import_name = 'CSV import to {}.{}'.format(schema, table)
    import_job = client.imports.post(import_name,
                                     'AutoImport',
                                     is_outbound=False,
                                     destination=destination,
                                     hidden=hidden)

    options = dict(max_errors=max_errors,
                   existing_table_rows=existing_table_rows,
                   diststyle=diststyle,
                   distkey=distkey,
                   sortkey1=sortkey1,
                   sortkey2=sortkey2,
                   column_delimiter=delimiter,
                   first_row_is_header=headers)

    # Attach a sync from the Civis file to the destination table, then run.
    client.imports.post_syncs(
        import_job.id,
        source=dict(file=dict(id=file_id)),
        destination=dict(database_table=dict(schema=schema, table=table)),
        advanced_options=options)

    run = client.jobs.post_runs(import_job.id)
    fut = CivisFuture(client.jobs.get_runs, (import_job.id, run['id']),
                      polling_interval=polling_interval,
                      client=client,
                      poll_on_creation=False)

    return fut
Example #6
0
def civis_to_csv(filename,
                 sql,
                 database,
                 job_name=None,
                 api_key=None,
                 client=None,
                 credential_id=None,
                 include_header=True,
                 compression='none',
                 delimiter=',',
                 unquoted=False,
                 archive=False,
                 hidden=True,
                 polling_interval=None):
    """Export the results of a SQL query from Civis to a local CSV file.

    The SQL string is executed twice: once to attempt to fetch headers
    and once to fetch the data, which allows a faster retrieval path.
    The header-fetching execution is constrained so it cannot change
    state (no INSERT, UPDATE, DELETE, etc.).

    Parameters
    ----------
    filename : str
        Write the exported data to this local file.
    sql : str, optional
        The SQL select string to execute.
    database : str or int
        Export from this database; may be the database name or ID.
    job_name : str, optional
        Name for the export job; a random name is generated when
        omitted.
    api_key : DEPRECATED str, optional
        Your Civis API key. Defaults to the :envvar:`CIVIS_API_KEY`
        environment variable when omitted.
    client : :class:`civis.APIClient`, optional
        An existing API client to use. If omitted, one is created from
        the :envvar:`CIVIS_API_KEY` environment variable.
    credential_id : str or int, optional
        The database credential ID. Falls back to the default
        credential when ``None``.
    include_header: bool, optional
        If ``True`` (the default), the first CSV line holds headers.
    compression: str, optional
        Compression of the output file: one of ``'none'``, ``'zip'``,
        or ``'gzip'``. Default ``'none'``. Note ``'gzip'`` currently
        yields an uncompressed file unless `include_header` is
        ``False``; a future release will return gzip in all cases.
    delimiter: str, optional
        Column delimiter: one of ``','``, ``'\t'``, or ``'|'``.
        Default: ``','``.
    unquoted: bool, optional
        Whether or not to quote fields. Default: ``False``.
    polling_interval : int or float, optional
        Seconds to wait between checks for query completion.
    archive : bool, optional (deprecated)
        If ``True``, archive the export job as soon as it completes.
    hidden : bool, optional
        When ``True`` (the default), the job does not appear in the
        Civis UI.

    Returns
    -------
    results : :class:`~civis.futures.CivisFuture`
        A `CivisFuture` object.

    Examples
    --------
    >>> sql = "SELECT * FROM schema.table"
    >>> fut = civis_to_csv("file.csv", sql, "my_database")
    >>> fut.result()  # Wait for job to complete

    See Also
    --------
    civis.io.read_civis : Read table contents into memory.
    civis.io.read_civis_sql : Read results of a SQL query into memory.
    """
    if archive:
        warnings.warn(
            "`archive` is deprecated and will be removed in v2.0.0. "
            "Use `hidden` instead.", FutureWarning)
    if client is None:
        client = APIClient(api_key=api_key, resources='all')

    database_id = client.get_database_id(database)
    credential_id = credential_id or client.default_credential

    # Deliberately unfixed for backward compatibility: requesting gzip
    # with headers currently produces an uncompressed file, because the
    # gzip output is decompressed during download.
    if compression == 'gzip' and include_header:
        compression = 'none'

    # Parallel unload is not supported here; its output format differs,
    # and switching would be a breaking change.
    headers = b''

    delimiter = DELIMITERS.get(delimiter)
    if not delimiter:
        raise ValueError("delimiter must be one of {}".format(
            DELIMITERS.keys()))

    # Always ask the platform for gzip output to reduce transfer I/O.
    csv_settings = {'include_header': include_header,
                    'compression': 'gzip',
                    'column_delimiter': delimiter,
                    'unquoted': unquoted,
                    'filename_prefix': None,
                    'force_multifile': False}

    script_id, run_id = _sql_script(client,
                                    sql,
                                    database_id,
                                    job_name,
                                    credential_id,
                                    hidden=hidden,
                                    csv_settings=csv_settings)
    fut = CivisFuture(client.scripts.get_sql_runs, (script_id, run_id),
                      polling_interval=polling_interval,
                      client=client,
                      poll_on_creation=False)
    # Download the exported file to `filename` once the run finishes.
    fut.add_done_callback(
        _download_callback(script_id, run_id, filename, headers,
                           compression))
    if archive:
        fut.add_done_callback(
            lambda _: client.scripts.put_sql_archive(script_id, True))

    return fut
Example #7
0
def read_civis_sql(sql,
                   database,
                   use_pandas=False,
                   job_name=None,
                   api_key=None,
                   client=None,
                   credential_id=None,
                   polling_interval=None,
                   archive=False,
                   hidden=True,
                   **kwargs):
    """Read data from Civis using a custom SQL string.

    The custom SQL string will be executed twice; once to attempt to
    retrieve headers and once to retrieve the data. This is done to
    use a more performant method for retrieving the data. The first
    execution of the custom SQL is controlled such that changes in
    state cannot occur (e.g., INSERT, UPDATE, DELETE, etc.).

    Parameters
    ----------
    sql : str, optional
        The SQL select string to be executed.
    database : str or int
        Execute the query against this database. Can be the database name
        or ID.
    use_pandas : bool, optional
        If ``True``, return a :class:`pandas:pandas.DataFrame`. Otherwise,
        return a list of results from :func:`python:csv.reader`.
    job_name : str, optional
        A name to give the job. If omitted, a random job name will be
        used.
    api_key : DEPRECATED str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    client : :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.
    credential_id : str or int, optional
        The database credential ID.  If ``None``, the default credential
        will be used.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for query completion.
    archive : bool, optional (deprecated)
        If ``True``, archive the import job as soon as it completes.
    hidden : bool, optional
        If ``True`` (the default), this job will not appear in the Civis UI.
    **kwargs : kwargs
        Extra keyword arguments are passed into
        :func:`pandas:pandas.read_csv` if `use_pandas` is ``True`` or
        passed into :func:`python:csv.reader` if `use_pandas` is
        ``False``.

    Returns
    -------
    data : :class:`pandas:pandas.DataFrame` or list
        A list of rows (with header as first row) if `use_pandas` is
        ``False``, otherwise a `pandas` `DataFrame`. Note that if
        `use_pandas` is ``False``, no parsing of types is performed and
        each row will be a list of strings.

    Raises
    ------
    ImportError
        If `use_pandas` is ``True`` and `pandas` is not installed.

    Examples
    --------
    >>> sql = "SELECT * FROM schema.table"
    >>> df = read_civis_sql(sql, "my_database", use_pandas=True)
    >>> col_a = df["column_a"]

    >>> data = read_civis_sql(sql, "my_database")
    >>> columns = data.pop(0)
    >>> col_a_index = columns.index("column_a")
    >>> col_a = [row[col_a_index] for row in data]

    Notes
    -----
    This reads the data into memory.

    See Also
    --------
    civis.io.read_civis : Read directly into memory without SQL.
    civis.io.civis_to_csv : Write directly to a CSV file.
    """
    if client is None:
        client = APIClient(api_key=api_key, resources='all')
    # Fail fast if the pandas output path was requested but unavailable.
    if use_pandas and NO_PANDAS:
        raise ImportError("use_pandas is True but pandas is not installed.")
    if archive:
        warnings.warn(
            "`archive` is deprecated and will be removed in v2.0.0. "
            "Use `hidden` instead.", FutureWarning)

    db_id = client.get_database_id(database)
    credential_id = credential_id or client.default_credential

    # determine if we can request headers separately; if we can then Platform
    # will perform a parallel unload which is significantly more performant
    # we start by assuming headers are requested
    # `headers` is the separately-fetched header row (or None), and
    # `ovrd_include_header` overrides whether the export itself includes one.
    ovrd_include_header, headers = _include_header(client, sql, True, db_id,
                                                   credential_id,
                                                   polling_interval)

    # if we retrieved headers then we are performing a parallel unload
    # in which case we need to specify backslash as the escapechar
    if headers is not None:
        kwargs['escapechar'] = '\\'

    # The export is always gzip-compressed to reduce transfer I/O.
    csv_settings = dict(include_header=ovrd_include_header, compression='gzip')

    script_id, run_id = _sql_script(client,
                                    sql,
                                    db_id,
                                    job_name,
                                    credential_id,
                                    csv_settings=csv_settings,
                                    hidden=hidden)
    fut = CivisFuture(client.scripts.get_sql_runs, (script_id, run_id),
                      polling_interval=polling_interval,
                      client=client,
                      poll_on_creation=False)
    if archive:

        def f(x):
            # Archive the SQL script once the run completes.
            return client.scripts.put_sql_archive(script_id, True)

        fut.add_done_callback(f)
    # Block until the script run finishes; raises if the run failed.
    fut.result()
    # The export lands in a Civis file; fetch the run's output record.
    outputs = client.scripts.get_sql_runs(script_id, run_id)["output"]
    if not outputs:
        raise EmptyResultError(
            "Query {} returned no output.".format(script_id))

    url = outputs[0]["path"]
    file_id = outputs[0]["file_id"]
    log.debug('Exported results to Civis file %s (%s)',
              outputs[0]["output_name"], file_id)

    if use_pandas:
        # allows users to enter their own names parameter
        _kwargs = {'names': headers}
        _kwargs.update(kwargs)
        # Force gzip decoding after the user-kwarg merge, since the export
        # was requested with compression='gzip' above.
        _kwargs['compression'] = 'gzip'

        data = pd.read_csv(url, **_kwargs)
    else:
        # Stream the gzipped export and decompress it into text.
        response = requests.get(url, stream=True)
        response.raise_for_status()

        with StringIO() as buf:
            # Prepend the separately fetched header row, if there is one.
            if headers:
                buf.write(','.join(headers) + '\n')
            _decompress_stream(response, buf, write_bytes=False)
            buf.seek(0)
            data = list(csv.reader(buf, **kwargs))

    return data
Example #8
0
def csv_to_civis(filename,
                 database,
                 table,
                 api_key=None,
                 max_errors=None,
                 existing_table_rows="fail",
                 distkey=None,
                 sortkey1=None,
                 sortkey2=None,
                 delimiter=",",
                 headers=None,
                 credential_id=None,
                 polling_interval=_DEFAULT_POLLING_INTERVAL,
                 archive=True):
    """Upload the contents of a local CSV file to Civis.

    Parameters
    ----------
    filename : str
        Upload the contents of this file.
    database : str or int
        Upload data into this database. Can be the database name or ID.
    table : str
        The schema and table you want to upload to. E.g.,
        ``'scratch.table'``.
    api_key : str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    max_errors : int, optional
        The maximum number of rows with errors to remove from the import
        before failing.
    existing_table_rows : str, optional
        The behaviour if a table with the requested name already exists.
        One of ``'fail'``, ``'truncate'`` or ``'append'``. Defaults to
        ``'fail'``.
    distkey : str, optional
        The column to use as the distkey for the table.
    sortkey1 : str, optional
        The column to use as the sortkey for the table.
    sortkey2 : str, optional
        The second column in a compound sortkey for the table.
    delimiter : string, optional
        The column delimiter. One of ``','``, ``'\\t'`` or ``'|'``.
    headers : bool, optional
        Whether or not the first row of the file should be treated as
        headers. The default, ``None``, attempts to autodetect whether
        or not the first row contains headers.
    credential_id : str or int, optional
        The ID of the database credential.  If ``None``, the default
        credential will be used.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for job completion.
    archive : bool, optional
        If ``True`` (the default), archive the import job as soon as it
        completes.

    Returns
    -------
    results : :class:`~civis.polling.PollableResult`
        A `PollableResult` object.

    Raises
    ------
    ValueError
        If `delimiter` is not one of ``','``, ``'\\t'`` or ``'|'``.

    Notes
    -----
    This reads the contents of `filename` into memory.

    Examples
    --------
    >>> with open('input_file.csv', 'w') as _input:
    ...     _input.write('a,b,c\\n1,2,3')
    >>> poller = civis.io.csv_to_civis('input_file.csv',
    ...                                'my-database',
    ...                                'scratch.my_data')
    >>> poller.result()
    """
    client = APIClient(api_key=api_key)
    schema, table = table.split(".", 1)
    db_id = client.get_database_id(database)
    cred_id = credential_id or client.default_credential
    delimiter = DELIMITERS.get(delimiter)
    # Raise an explicit exception instead of using ``assert`` so the
    # validation still runs under ``python -O`` (asserts are stripped).
    # This matches the delimiter check in ``civis_to_csv``.
    if not delimiter:
        raise ValueError("delimiter must be one of {}".format(
            DELIMITERS.keys()))

    kwargs = dict(schema=schema,
                  name=table,
                  remote_host_id=db_id,
                  credential_id=cred_id,
                  max_errors=max_errors,
                  existing_table_rows=existing_table_rows,
                  distkey=distkey,
                  sortkey1=sortkey1,
                  sortkey2=sortkey2,
                  column_delimiter=delimiter,
                  first_row_is_header=headers)

    import_job = client.imports.post_files(**kwargs)
    # Upload the file's bytes, then start the import run via the session.
    with open(filename, "rb") as data:
        put_response = requests.put(import_job.upload_uri, data)
    put_response.raise_for_status()
    run_job_result = client._session.post(import_job.run_uri)
    run_job_result.raise_for_status()
    run_info = run_job_result.json()
    poll = PollableResult(client.imports.get_files_runs,
                          (run_info['importId'], run_info['id']),
                          polling_interval=polling_interval)
    if archive:

        def f(x):
            # Archive the import job once the run completes.
            return client.imports.put_archive(import_job.id, True)

        poll.add_done_callback(f)
    return poll
Example #9
0
class ModelPipeline:
    """Interface for scikit-learn modeling in the Civis Platform

    Each ModelPipeline corresponds to a scikit-learn
    :class:`~sklearn.pipeline.Pipeline` which will run in Civis Platform.

    Note that this object can be safely pickled and unpickled, but it
    does not store the state of any attached :class:`~civis.APIClient` object.
    An unpickled ModelPipeline will use the API key from the user's
    environment.

    Parameters
    ----------
    model : string or Estimator
        Either the name of a pre-defined model
        (e.g. "sparse_logistic" or "gradient_boosting_classifier")
        or else a pre-existing Estimator object.
    dependent_variable : string or List[str]
        The dependent variable of the training dataset.
        For a multi-target problem, this should be a list of
        column names of dependent variables.
    primary_key : string, optional
        The unique ID (primary key) of the training dataset.
        This will be used to index the out-of-sample scores.
    parameters : dict, optional
        Specify parameters for the final stage estimator in a
        predefined model, e.g. ``{'C': 2}`` for a "sparse_logistic"
        model.
    cross_validation_parameters : dict, optional
        Cross validation parameter grid for learner parameters, e.g.
        ``{'n_estimators': [100, 200, 500], 'learning_rate': [0.01, 0.1],
        'max_depth': [2, 3]}``.
    model_name : string, optional
        The prefix of the Platform modeling jobs. It will have
        " Train" or " Predict" added to become the Script title.
    calibration : {None, "sigmoid", "isotonic"}
        If not None, calibrate output probabilities with the selected method.
        Valid only with classification models.
    excluded_columns : array, optional
        A list of columns which will be considered ineligible to be
        independent variables.
    client : :class:`~civis.APIClient`, optional
        If not provided, an :class:`~civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.
    cpu_requested : int, optional
        Number of CPU shares requested in the Civis Platform for
        training jobs. 1024 shares = 1 CPU.
    memory_requested : int, optional
        Memory requested from Civis Platform for training jobs, in MiB
    disk_requested : float, optional
        Disk space requested on Civis Platform for training jobs, in GB
    notifications : dict
        See :func:`~civis.resources._resources.Scripts.post_custom` for
        further documentation about email and URL notification.
    dependencies : array, optional
        List of packages to install from PyPI or git repository (i.e., Github
        or Bitbucket). If a private repo is specified, please include a
        ``git_token_name`` argument as well (see below). Make sure to pin
        dependencies to a specific version, since dependencies will be
        reinstalled during every training and predict job.
    git_token_name : str, optional
        Name of remote git API token stored in Civis Platform as the password
        field in a custom platform credential. Used only when installing
        private git repositories.
    verbose : bool, optional
        If True, supply debug outputs in Platform logs and make
        prediction child jobs visible.
    etl : Estimator, optional
        Custom ETL estimator which overrides the default ETL, and
        is run before training and validation.

    Methods
    -------
    train()
        Train the model on data in Civis Platform; outputs
        :class:`~civis.ml.ModelFuture`
    predict()
        Make predictions on new data; outputs :class:`~civis.ml.ModelFuture`
    from_existing()
        Class method; use to create a :class:`~civis.ml.ModelPipeline`
        from an existing model training run

    Attributes
    ----------
    estimator : :class:`~sklearn.pipeline.Pipeline`
        The trained scikit-learn Pipeline
    train_result_ : :class:`~civis.ml.ModelFuture`
        :class:`~civis.ml.ModelFuture` encapsulating this model's training run
    state : str
        Status of the training job (non-blocking)

    Examples
    --------
    >>> from civis.ml import ModelPipeline
    >>> model = ModelPipeline('gradient_boosting_classifier', 'depvar',
    ...                       primary_key='voterbase_id')
    >>> train = model.train(table_name='schema.survey_data',
    ...                     fit_params={'sample_weight': 'survey_weight'},
    ...                     database_name='My Redshift Cluster',
    ...                     oos_scores='scratch.survey_depvar_oos_scores')
    >>> train
    <ModelFuture at 0x11be7ae10 state=queued>
    >>> train.running()
    True
    >>> train.done()
    False
    >>> df = train.table  # Read OOS scores from its Civis File. Blocking.
    >>> meta = train.metadata  # Metadata from training run
    >>> train.metrics['roc_auc']
    0.88425
    >>> pred = model.predict(table_name='schema.demographics_table ',
    ...                      database_name='My Redshift Cluster',
    ...                      output_table='schema.predicted_survey_response',
    ...                      if_exists='drop',
    ...                      n_jobs=50)
    >>> df_pred = pred.table  # Blocks until finished
    # Modify the parameters of the base estimator in a default model:
    >>> model = ModelPipeline('sparse_logistic', 'depvar',
    ...                       primary_key='voterbase_id',
    ...                       parameters={'C': 2})
    # Grid search over hyperparameters in the base estimator:
    >>> model = ModelPipeline('sparse_logistic', 'depvar',
    ...                       primary_key='voterbase_id',
    ...                       cross_validation_parameters={'C': [0.1, 1, 10]})

    See Also
    --------
    civis.ml.ModelFuture
    """
    # Civis Platform custom-script template IDs used to launch training
    # and prediction jobs. These are the v2.0 templates.
    train_template_id = 9968
    predict_template_id = 9969
    # These are the v1.1 templates, used when the account cannot access
    # the v2.0 templates (see _set_template_version); __init__ shadows
    # the class-level IDs with these on the instance in that case.
    _train_template_id_fallback = 9112
    _predict_template_id_fallback = 9113

    def _set_template_version(self, client):
        """Decide whether this user can run the newest CivisML templates.

        The answer is cached in the module-level flag
        ``_NEWEST_CIVISML_VERSION`` so the API check happens at most once
        per process. Used for internal or limited releases of new
        CivisML versions.
        """
        global _NEWEST_CIVISML_VERSION
        if '_NEWEST_CIVISML_VERSION' in globals():
            # A previous ModelPipeline already ran the check; reuse it.
            return
        latest_train_id = max(_PRED_TEMPLATES.keys())
        try:
            # Probe both the newest training and prediction templates;
            # an API error fetching either means we lack access.
            client.templates.get_scripts(id=latest_train_id)
            client.templates.get_scripts(id=_PRED_TEMPLATES[latest_train_id])
        except CivisAPIError:
            _NEWEST_CIVISML_VERSION = False
        else:
            _NEWEST_CIVISML_VERSION = True

    def __init__(self, model, dependent_variable,
                 primary_key=None, parameters=None,
                 cross_validation_parameters=None, model_name=None,
                 calibration=None, excluded_columns=None, client=None,
                 cpu_requested=None, memory_requested=None,
                 disk_requested=None, notifications=None,
                 dependencies=None, git_token_name=None, verbose=False,
                 etl=None):
        """Set up a CivisML modeling pipeline.

        See the class docstring for parameter documentation.
        """
        self.model = model
        self._input_model = model  # In case we need to modify the input
        if isinstance(dependent_variable, str):
            # Standardize the dependent variable as a list.
            dependent_variable = [dependent_variable]
        self.dependent_variable = dependent_variable

        # optional but common parameters
        self.primary_key = primary_key
        self.parameters = parameters or {}
        self.cv_params = cross_validation_parameters or {}
        self.model_name = model_name  # None lets Platform use template name
        self.excluded_columns = excluded_columns
        self.calibration = calibration
        self.job_resources = {'REQUIRED_CPU': cpu_requested,
                              'REQUIRED_MEMORY': memory_requested,
                              'REQUIRED_DISK_SPACE': disk_requested}
        self.notifications = notifications or {}
        self.dependencies = dependencies
        self.git_token_name = git_token_name
        self.verbose = verbose

        if client is None:
            client = APIClient(resources='all')
        self._client = client
        self.train_result_ = None

        # Sets the module-level _NEWEST_CIVISML_VERSION flag if unset.
        self._set_template_version(client)

        if _NEWEST_CIVISML_VERSION:
            self.etl = etl
        elif etl is not None:
            # Reaching here means the user requested custom ETL on an
            # older CivisML release, which cannot run it.
            raise NotImplementedError("The etl argument is not implemented"
                                      " in this version of CivisML.")
        else:
            # Fall back to previous version templates. Also define
            # ``self.etl`` so the attribute always exists regardless of
            # which CivisML version is in use.
            self.etl = None
            self.train_template_id = self._train_template_id_fallback
            self.predict_template_id = self._predict_template_id_fallback

    def __getstate__(self):
        """Return picklable state, dropping the attached API client.

        The :class:`~civis.APIClient` cannot be pickled; ``__setstate__``
        recreates it from the environment on unpickling.
        """
        picklable = dict(self.__dict__)
        picklable.pop('_client')
        return picklable

    def __setstate__(self, state):
        """Restore pickled state and recreate the API client.

        The client is not pickled (it is removed in ``__getstate__``), so
        rebuild it from the environment and re-run the CivisML
        template-version check.
        """
        self.__dict__ = state
        self._client = APIClient(resources='all')
        self._set_template_version(self._client)

    @classmethod
    def from_existing(cls, train_job_id, train_run_id='latest', client=None):
        """Create a :class:`ModelPipeline` object from existing model IDs

        Parameters
        ----------
        train_job_id : int
            The ID of the CivisML job in the Civis Platform
        train_run_id : int or string, optional
            Location of the model run, either

            * an explicit run ID,
            * "latest" : The most recent run
            * "active" : The run designated by the training job's
              "active build" parameter
        client : :class:`~civis.APIClient`, optional
            If not provided, an :class:`~civis.APIClient` object will be
            created from the :envvar:`CIVIS_API_KEY`.

        Returns
        -------
        :class:`~civis.ml.ModelPipeline`
            A :class:`~civis.ml.ModelPipeline` which refers to
            a previously-trained model

        Examples
        --------
        >>> from civis.ml import ModelPipeline
        >>> model = ModelPipeline.from_existing(job_id)
        >>> model.train_result_.metrics['roc_auc']
        0.843
        """
        train_job_id = int(train_job_id)  # Convert np.int to int
        if client is None:
            client = APIClient(resources='all')
        # Resolve "latest"/"active" aliases to a concrete run ID.
        train_run_id = _decode_train_run(train_job_id, train_run_id, client)
        try:
            fut = ModelFuture(train_job_id, train_run_id, client=client)
            container = client.scripts.get_containers(train_job_id)
        except CivisAPIError as api_err:
            if api_err.status_code == 404:
                # Convert the API's 404 into a clearer ValueError.
                msg = ('There is no Civis Platform job with '
                       'script ID {} and run ID {}!'.format(train_job_id,
                                                            train_run_id))
                six.raise_from(ValueError(msg), api_err)
            raise

        # Reconstruct the constructor arguments from the training
        # container script's stored arguments.
        args = container.arguments

        # Older templates used "WORKFLOW" instead of "MODEL"
        model = args.get('MODEL', args.get('WORKFLOW'))
        # TARGET_COLUMN and EXCLUDE_COLS are stored space-delimited.
        dependent_variable = args['TARGET_COLUMN'].split()
        primary_key = args.get('PRIMARY_KEY')
        # PARAMS and CVPARAMS are stored as JSON strings.
        parameters = json.loads(args.get('PARAMS', "{}"))
        cross_validation_parameters = json.loads(args.get('CVPARAMS', "{}"))
        calibration = args.get('CALIBRATION')
        excluded_columns = args.get('EXCLUDE_COLS', None)
        if excluded_columns:
            excluded_columns = excluded_columns.split()
        cpu_requested = args.get('REQUIRED_CPU')
        memory_requested = args.get('REQUIRED_MEMORY')
        disk_requested = args.get('REQUIRED_DISK_SPACE')
        name = container.name
        if name.endswith(' Train'):
            # Strip object-applied suffix
            name = name[:-len(' Train')]
        # The API reports notification keys in camelCase; the
        # constructor expects snake_case.
        notifications = {camel_to_snake(key): val for key, val
                         in container.notifications.items()}
        dependencies = args.get('DEPENDENCIES', None)
        if dependencies:
            dependencies = dependencies.split()
        # GIT_CRED stores a credential ID; recover the credential name.
        git_token_name = args.get('GIT_CRED', None)
        if git_token_name:
            git_token_name = client.credentials.get(git_token_name).name

        klass = cls(model=model,
                    dependent_variable=dependent_variable,
                    primary_key=primary_key,
                    model_name=name,
                    parameters=parameters,
                    cross_validation_parameters=cross_validation_parameters,
                    calibration=calibration,
                    excluded_columns=excluded_columns,
                    client=client,
                    cpu_requested=cpu_requested,
                    disk_requested=disk_requested,
                    memory_requested=memory_requested,
                    notifications=notifications,
                    dependencies=dependencies,
                    git_token_name=git_token_name,
                    verbose=args.get('DEBUG', False))
        klass.train_result_ = fut

        # Set prediction template corresponding to training template
        template_id = int(container['from_template_id'])
        p_id = _PRED_TEMPLATES.get(template_id)
        if p_id is None:
            # The model was trained with a template this client version
            # doesn't know about; fall back to the newest known one.
            warnings.warn('Model %s was trained with a newer version of '
                          'CivisML than is available in the API client '
                          'version %s. Please update your API client version. '
                          'Attempting to use an older version of the '
                          'prediction code. Prediction will either fail '
                          'immediately or succeed.'
                          % (train_job_id, __version__), RuntimeWarning)
            p_id = max(_PRED_TEMPLATES.values())
        klass.predict_template_id = p_id

        return klass

    def train(self, df=None, csv_path=None, table_name=None,
              database_name=None, file_id=None,
              sql_where=None, sql_limit=None, oos_scores=None,
              oos_scores_db=None, if_exists='fail', fit_params=None,
              polling_interval=None, validation_data='train', n_jobs=4):
        """Start a Civis Platform job to train your model

        Provide input through one of
        a :class:`~pandas.DataFrame` (``df``),
        a local CSV (``csv_path``),
        a Civis Table (``table_name`` and ``database_name``), or
        a Civis File containing a CSV (``file_id``).

        Model outputs will always contain out-of-sample scores
        (accessible through :attr:`ModelFuture.table` on this function's
        output), and you may chose to store these out-of-sample scores
        in a Civis Table with the ``oos_scores``, ``oos_scores_db``,
        and ``if_exists`` parameters.

        Parameters
        ----------
        df : pd.DataFrame, optional
            A :class:`~pandas.DataFrame` of training data.
            The :class:`~pandas.DataFrame` will be uploaded to a Civis file so
            that CivisML can access it.
            Note that the index of the :class:`~pandas.DataFrame` will be
            ignored -- use ``df.reset_index()`` if you want your
            index column to be included with the data passed to CivisML.
        csv_path : str, optional
            The location of a CSV of data on the local disk.
            It will be uploaded to a Civis file.
        table_name : str, optional
            The qualified name of the table containing the training set from
            which to build the model.
        database_name : str, optional
            Name of the database holding the training set table used to
            build the model. E.g., 'My Cluster Name'.
        file_id : int, optional
            If the training data are stored in a Civis file,
            provide the integer file ID.
        sql_where : str, optional
            A SQL WHERE clause used to scope the rows of the training set
            (used for table input only)
        sql_limit : int, optional
            SQL LIMIT clause for querying the training set
            (used for table input only)
        oos_scores : str, optional
            If provided, store out-of-sample predictions on
            training set data to this Redshift "schema.tablename".
        oos_scores_db : str, optional
            If not provided, store OOS predictions in the same database
            which holds the training data.
        if_exists : {'fail', 'append', 'drop', 'truncate'}
            Action to take if the out-of-sample prediction table
            already exists.
        fit_params : Dict[str, str]
            Mapping from parameter names in the model's ``fit`` method
            to the column names which hold the data, e.g.
            ``{'sample_weight': 'survey_weight_column'}``.
        polling_interval : float, optional
            Check for job completion every this number of seconds.
            Do not set if using the notifications endpoint.
        validation_data : str, optional
            Source for validation data. There are currently two options:
            `'train'` (the default), which cross-validates over training data
            for validation; and `'skip'`, which skips the validation step.
        n_jobs : int, optional
            Number of jobs to use for training and validation. Defaults to
            4, which allows parallelization over the 4 cross validation folds.
            Increase n_jobs to parallelize over many hyperparameter
            combinations in grid search/hyperband, or decrease to use fewer
            computational resources at once.

        Returns
        -------
        :class:`~civis.ml.ModelFuture`
        """
        # Require exactly one source of training data.
        if ((table_name is None or database_name is None) and
                file_id is None and df is None and csv_path is None):
            raise ValueError('Provide a source of data.')
        if sum((bool(table_name and database_name),
                bool(file_id), df is not None, csv_path is not None)) > 1:
            raise ValueError('Provide a single source of data.')

        # Local inputs are stashed in a Civis file so CivisML can reach them.
        if df is not None:
            file_id = _stash_local_dataframe(df, client=self._client)
        elif csv_path:
            file_id = _stash_local_file(csv_path, client=self._client)

        train_args = {'TARGET_COLUMN': ' '.join(self.dependent_variable),
                      'PRIMARY_KEY': self.primary_key,
                      'PARAMS': json.dumps(self.parameters),
                      'CVPARAMS': json.dumps(self.cv_params),
                      'CALIBRATION': self.calibration,
                      'IF_EXISTS': if_exists}
        if oos_scores:
            train_args['OOSTABLE'] = oos_scores
        if oos_scores_db:
            oos_db_id = self._client.get_database_id(oos_scores_db)
            train_args['OOSDB'] = {'database': oos_db_id}
        if sql_where:
            train_args['WHERESQL'] = sql_where
        if sql_limit:
            train_args['LIMITSQL'] = sql_limit
        if self.excluded_columns:
            train_args['EXCLUDE_COLS'] = ' '.join(self.excluded_columns)
        if fit_params:
            train_args['FIT_PARAMS'] = json.dumps(fit_params)
        if self.dependencies:
            train_args['DEPENDENCIES'] = ' '.join(self.dependencies)
        if _NEWEST_CIVISML_VERSION:
            # These arguments only exist on the newest templates.
            if validation_data:
                train_args['VALIDATION_DATA'] = validation_data
            if n_jobs:
                train_args['N_JOBS'] = n_jobs

        if HAS_SKLEARN and isinstance(self.model, BaseEstimator):
            # Serialize a custom estimator to a Civis file and pass its
            # file ID as the model. Create the temporary directory
            # *before* entering the ``try`` block: if ``mkdtemp`` itself
            # failed inside it, ``tempdir`` would be unbound and the
            # ``finally`` cleanup would raise UnboundLocalError, masking
            # the original error.
            tempdir = tempfile.mkdtemp()
            try:
                fout = os.path.join(tempdir, 'estimator.pkl')
                joblib.dump(self.model, fout, compress=3)
                with open(fout, 'rb') as _fout:
                    n = self.model_name if self.model_name else "CivisML"
                    estimator_file_id = cio.file_to_civis(
                        _fout, 'Estimator for ' + n, client=self._client)
                self._input_model = self.model  # Keep the estimator
                self.model = str(estimator_file_id)
            finally:
                shutil.rmtree(tempdir)
        train_args['MODEL'] = self.model

        if HAS_SKLEARN and _NEWEST_CIVISML_VERSION:
            if isinstance(self.etl, BaseEstimator):
                # Same pattern as above: create the directory outside the
                # ``try`` so cleanup can never hit an unbound name.
                tempdir = tempfile.mkdtemp()
                try:
                    fout = os.path.join(tempdir, 'ETL.pkl')
                    joblib.dump(self.etl, fout, compress=3)
                    with open(fout, 'rb') as _fout:
                        etl_file_id = cio.file_to_civis(
                            _fout, 'ETL Estimator', client=self._client)
                    train_args['ETL'] = str(etl_file_id)
                finally:
                    shutil.rmtree(tempdir)

        name = self.model_name + ' Train' if self.model_name else None
        # Clear the existing training result so we can make a new one.
        self.train_result_ = None

        result, container, run = self._create_custom_run(
              self.train_template_id,
              job_name=name,
              table_name=table_name,
              database_name=database_name,
              file_id=file_id,
              args=train_args,
              resources=self.job_resources,
              polling_interval=polling_interval)

        self.train_result_ = result

        return result

    def _create_custom_run(self, template_id, job_name=None, table_name=None,
                           database_name=None, file_id=None, args=None,
                           resources=None, polling_interval=None):
        """Create and start a custom script run for training or prediction.

        Returns a tuple of (:class:`ModelFuture`, container script, run).
        """
        # Handle int-like but non-Python-integer types such as np.int64
        file_id = int(file_id) if file_id is not None else file_id
        script_arguments = {'TABLE_NAME': table_name,
                            'CIVIS_FILE_ID': file_id,
                            'DEBUG': self.verbose}
        if database_name:
            if template_id < 8000:
                # v0 jobs used a different database parameter
                script_arguments['DB_NAME'] = database_name
            else:
                db_id = self._client.get_database_id(database_name)
                script_arguments['DB'] = {'database': db_id}
        resources = resources or {}
        for key, value in resources.items():
            if value:
                # Default resources are set on the template. Only
                # modify via arguments if users give a non-default value.
                script_arguments[key] = value
        if self.git_token_name:
            creds = find(self._client.credentials.list(),
                         name=self.git_token_name,
                         type='Custom')
            # Require exactly one matching credential. Zero matches used
            # to slip past a ``> 1`` check and crash below with an
            # IndexError; the "not found" message covers both cases.
            if len(creds) != 1:
                raise ValueError("Unique credential with name '{}' for "
                                 "remote git hosting service not found!"
                                 .format(self.git_token_name))
            script_arguments['GIT_CRED'] = creds[0].id

        script_arguments.update(args or {})

        container = self._client.scripts.post_custom(
            from_template_id=template_id,
            name=job_name,
            arguments=script_arguments,
            notifications=self.notifications)
        log.info('Created custom script %s.', container.id)

        run = self._client.scripts.post_custom_runs(container.id)
        log.debug('Started job %s, run %s.', container.id, run.id)

        # Prediction futures carry a reference to the training run so
        # they can fetch training metadata.
        train_kwargs = {}
        if self.train_result_ is not None:
            train_kwargs = {'train_job_id': self.train_result_.job_id,
                            'train_run_id': self.train_result_.run_id}
        fut = ModelFuture(
              container.id,
              run.id,
              client=self._client,
              polling_interval=polling_interval,
              poll_on_creation=False,
              **train_kwargs)

        return fut, container, run

    @property
    @_check_fit_initiated
    def state(self):
        """str : Status of the training job (non-blocking)."""
        return self.train_result_.state

    @property
    @_check_fit_initiated
    def estimator(self):
        """:class:`~sklearn.pipeline.Pipeline` : The trained scikit-learn
        Pipeline from this model's training run."""
        return self.train_result_.estimator

    @_check_fit_initiated
    def predict(self, df=None, csv_path=None,
                table_name=None, database_name=None,
                manifest=None, file_id=None, sql_where=None, sql_limit=None,
                primary_key=SENTINEL, output_table=None, output_db=None,
                if_exists='fail', n_jobs=None, polling_interval=None,
                cpu=None, memory=None, disk_space=None):
        """Make predictions on a trained model

        Provide input through one of
        a :class:`~pandas.DataFrame` (``df``),
        a local CSV (``csv_path``),
        a Civis Table (``table_name`` and ``database_name``),
        a Civis File containing a CSV (``file_id``), or
        a Civis File containing a manifest file (``manifest``).

        A "manifest file" is JSON which specifies the location of
        many shards of the data to be used for prediction.
        A manifest file is the output of a Civis
        export job with ``force_multifile=True`` set,
        e.g. from :func:`civis.io.civis_to_multifile_csv`.
        Large Civis Tables (provided using ``table_name``)
        will automatically be exported to manifest files.

        Prediction outputs will always be stored as gzipped
        CSVs in one or more Civis Files. You can find a list of
        File ID numbers for output files at the "output_file_ids"
        key in the metadata returned by the prediction job.
        Provide an ``output_table`` (and optionally an ``output_db``,
        if it's different from ``database_name``) to copy these
        predictions into a Civis Table.

        Parameters
        ----------
        df : pd.DataFrame, optional
            A :class:`~pandas.DataFrame` of data for prediction.
            The :class:`~pandas.DataFrame` will be uploaded to a Civis file so
            that CivisML can access it.
            Note that the index of the :class:`~pandas.DataFrame` will be
            ignored -- use ``df.reset_index()`` if you want your
            index column to be included with the data passed to CivisML.
        csv_path : str, optional
            The location of a CSV of data on the local disk.
            It will be uploaded to a Civis file.
        table_name : str, optional
            The qualified name of the table containing your data
        database_name : str, optional
            Name of the database holding the
            data, e.g., 'My Redshift Cluster'.
        manifest : int, optional
            ID for a manifest file stored as a Civis file.
            (Note: if the manifest is not a Civis Platform-specific manifest,
            like the one returned from :func:`civis.io.civis_to_multifile_csv`,
            this must be used in conjunction with table_name and database_name
            due to the need for column discovery via Redshift.)
        file_id : int, optional
            If the data are a CSV stored in a Civis file,
            provide the integer file ID.
        sql_where : str, optional
            A SQL WHERE clause used to scope the rows to be predicted
        sql_limit : int, optional
            SQL LIMIT clause to restrict the size of the prediction set
        primary_key : str, optional
            Primary key of the prediction table. Defaults to
            the primary key of the training data. Use ``None`` to
            indicate that the prediction data don't have a
            primary key column.
        output_table : str, optional
            The table in which to put the predictions.
        output_db : str, optional
            Database of the output table. Defaults to the database
            of the input table.
        if_exists : {'fail', 'append', 'drop', 'truncate'}
            Action to take if the prediction table already exists.
        n_jobs : int, optional
            Number of concurrent Platform jobs to use
            for multi-file / large table prediction.
        polling_interval : float, optional
            Check for job completion every this number of seconds.
            Do not set if using the notifications endpoint.
        cpu : int, optional
            CPU shares requested by the user for a single job.
        memory : int, optional
            RAM requested by the user for a single job.
        disk_space : float, optional
            disk space requested by the user for a single job.

        Returns
        -------
        :class:`~civis.ml.ModelFuture`
        """
        self.train_result_.result()  # Blocks and raises training errors

        # Require exactly one source of prediction data. A manifest may
        # be combined with table_name/database_name (see the docstring),
        # so the two count as a single source in the check below.
        if ((table_name is None or database_name is None) and
                file_id is None and df is None and csv_path is None and
                manifest is None):
            raise ValueError('Provide a source of data.')
        if sum((bool(table_name and database_name) or (manifest is not None),
                bool(file_id), df is not None, csv_path is not None)) > 1:
            raise ValueError('Provide a single source of data.')

        # Local inputs are stashed in a Civis file so CivisML can reach them.
        if df is not None:
            file_id = _stash_local_dataframe(df, client=self._client)
        elif csv_path:
            file_id = _stash_local_file(csv_path, client=self._client)

        # SENTINEL distinguishes "not passed" (use the training primary
        # key) from an explicit ``primary_key=None``.
        if primary_key is SENTINEL:
            primary_key = self.primary_key

        predict_args = {'TRAIN_JOB': self.train_result_.job_id,
                        'TRAIN_RUN': self.train_result_.run_id,
                        'PRIMARY_KEY': primary_key,
                        'IF_EXISTS': if_exists}
        if output_table:
            predict_args['OUTPUT_TABLE'] = output_table
        if output_db:
            if self.predict_template_id == 7021:
                # v0 jobs used a different database parameter
                predict_args['OUTPUT_DB'] = output_db
            else:
                predict_args['OUTPUT_DB'] = {'database': output_db_id}

        if manifest:
            predict_args['MANIFEST'] = manifest
        if sql_where:
            predict_args['WHERESQL'] = sql_where
        if sql_limit:
            predict_args['LIMITSQL'] = sql_limit
        if n_jobs:
            predict_args['N_JOBS'] = n_jobs
        if _NEWEST_CIVISML_VERSION:
            # Per-job resource overrides only exist on the newest templates.
            if cpu:
                predict_args['CPU'] = cpu
            if memory:
                predict_args['MEMORY'] = memory
            if disk_space:
                predict_args['DISK_SPACE'] = disk_space

        name = self.model_name + ' Predict' if self.model_name else None
        result, container, run = self._create_custom_run(
            self.predict_template_id,
            job_name=name,
            table_name=table_name,
            database_name=database_name,
            file_id=file_id,
            args=predict_args,
            polling_interval=polling_interval)

        return result