Esempio n. 1
0
def civis_to_csv(filename,
                 sql,
                 database,
                 job_name=None,
                 api_key=None,
                 credential_id=None,
                 polling_interval=_DEFAULT_POLLING_INTERVAL,
                 archive=True):
    """Run a SQL export on Civis and save the output to a local CSV file.

    The function starts the SQL script, wraps its run in a pollable
    result and returns that object immediately; the download into
    `filename` is registered as a completion callback on it.

    Parameters
    ----------
    filename : str
        Path of the local file that receives the exported data.
    sql : str
        The SQL select string to be executed.
    database : str or int
        Database to export from, given either by name or by ID.
    job_name : str, optional
        Name for the export job. If omitted, a random job name will be
        used.
    api_key : str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    credential_id : str or int, optional
        ID of the database credential. If ``None``, the default
        credential will be used.
    polling_interval : int or float, optional
        Seconds to wait between successive checks for query completion.
    archive : bool, optional
        When ``True`` (the default), a second completion callback is
        registered that archives the SQL script.

    Returns
    -------
    results : :class:`~civis.polling.PollableResult`
        The `PollableResult` tracking the export run.

    Examples
    --------
    >>> sql = "SELECT * FROM schema.table"
    >>> poll = civis_to_csv("file.csv", sql, "my_database")
    >>> poll.result()  # Wait for job to complete

    See Also
    --------
    civis.io.read_civis : Read table contents into memory.
    civis.io.read_civis_sql : Read results of a SQL query into memory.
    """
    client = APIClient(api_key=api_key)
    script_id, run_id = _sql_script(client, sql, database, job_name,
                                    credential_id)
    future = PollableResult(client.scripts.get_sql_runs,
                            (script_id, run_id),
                            polling_interval)

    # The download callback is registered first, ahead of the optional
    # archive callback below.
    future.add_done_callback(
        _download_callback(script_id, run_id, client, filename))

    if not archive:
        return future

    def _archive_script(_finished):
        # The callback argument is unused; only the script ID matters.
        return client.scripts.put_sql_archive(script_id, True)

    future.add_done_callback(_archive_script)
    return future
Esempio n. 2
0
def csv_to_civis(filename,
                 database,
                 table,
                 api_key=None,
                 max_errors=None,
                 existing_table_rows="fail",
                 distkey=None,
                 sortkey1=None,
                 sortkey2=None,
                 delimiter=",",
                 headers=None,
                 credential_id=None,
                 polling_interval=_DEFAULT_POLLING_INTERVAL,
                 archive=True):
    """Upload the contents of a local CSV file to Civis.

    Parameters
    ----------
    filename : str
        Upload the contents of this file.
    database : str or int
        Upload data into this database. Can be the database name or ID.
    table : str
        The schema and table you want to upload to, separated by a dot.
        E.g., ``'scratch.table'``.
    api_key : str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    max_errors : int, optional
        The maximum number of rows with errors to remove from the import
        before failing.
    existing_table_rows : str, optional
        The behaviour if a table with the requested name already exists.
        One of ``'fail'``, ``'truncate'`` or ``'append'``. Defaults to
        ``'fail'``.
    distkey : str, optional
        The column to use as the distkey for the table.
    sortkey1 : str, optional
        The column to use as the sortkey for the table.
    sortkey2 : str, optional
        The second column in a compound sortkey for the table.
    delimiter : string, optional
        The column delimiter. One of ``','``, ``'\\t'`` or ``'|'``.
    headers : bool, optional
        Whether or not the first row of the file should be treated as
        headers. The default, ``None``, attempts to autodetect whether
        or not the first row contains headers.
    credential_id : str or int, optional
        The ID of the database credential.  If ``None``, the default
        credential will be used.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for job completion.
    archive : bool, optional
        If ``True`` (the default), archive the import job as soon as it
        completes.

    Returns
    -------
    results : :class:`~civis.polling.PollableResult`
        A `PollableResult` object.

    Raises
    ------
    ValueError
        If `table` does not contain a ``'.'`` separating schema and table
        name, or if `delimiter` is not a supported delimiter.
    requests.HTTPError
        If uploading the file or starting the import run returns an HTTP
        error status.

    Notes
    -----
    The file is opened in binary mode and the open file object is handed
    to ``requests.put`` as the request body.

    Examples
    --------
    >>> with open('input_file.csv', 'w') as _input:
    ...     _input.write('a,b,c\\n1,2,3')
    >>> poller = civis.io.csv_to_civis('input_file.csv',
    ...                                'my-database',
    ...                                'scratch.my_data')
    >>> poller.result()
    """
    # Validate the arguments before any client call so that bad input
    # fails fast.
    schema, dot, table_name = table.partition(".")
    if not dot:
        raise ValueError(
            "table must be of the form 'schema.table', "
            "got {!r}".format(table))

    # Raise explicitly instead of using ``assert``: assertions are removed
    # when Python runs with ``-O``, which would let an unsupported
    # delimiter through as ``None``.
    column_delimiter = DELIMITERS.get(delimiter)
    if not column_delimiter:
        raise ValueError(
            "delimiter must be one of {}".format(list(DELIMITERS)))

    client = APIClient(api_key=api_key)
    db_id = client.get_database_id(database)
    cred_id = credential_id or client.default_credential

    kwargs = dict(schema=schema,
                  name=table_name,
                  remote_host_id=db_id,
                  credential_id=cred_id,
                  max_errors=max_errors,
                  existing_table_rows=existing_table_rows,
                  distkey=distkey,
                  sortkey1=sortkey1,
                  sortkey2=sortkey2,
                  column_delimiter=column_delimiter,
                  first_row_is_header=headers)

    # Create the import job, upload the file to the URI it provides, then
    # start the run by POSTing to the job's run URI.
    import_job = client.imports.post_files(**kwargs)
    with open(filename, "rb") as data:
        put_response = requests.put(import_job.upload_uri, data)
    put_response.raise_for_status()
    run_job_result = client._session.post(import_job.run_uri)
    run_job_result.raise_for_status()
    run_info = run_job_result.json()
    poll = PollableResult(client.imports.get_files_runs,
                          (run_info['importId'], run_info['id']),
                          polling_interval=polling_interval)
    if archive:

        def f(x):
            # The callback argument is unused; archive by import job ID.
            return client.imports.put_archive(import_job.id, True)

        poll.add_done_callback(f)
    return poll
Esempio n. 3
0
def read_civis_sql(sql,
                   database,
                   use_pandas=False,
                   job_name=None,
                   api_key=None,
                   credential_id=None,
                   polling_interval=_DEFAULT_POLLING_INTERVAL,
                   archive=True,
                   **kwargs):
    """Execute a custom SQL string on Civis and return the results.

    Unlike the asynchronous export helpers, this call blocks until the
    query run has finished and then loads its first output into memory.

    Parameters
    ----------
    sql : str
        The SQL select string to be executed.
    database : str or int
        Database to run the query against, given by name or by ID.
    use_pandas : bool, optional
        If ``True``, return a :class:`pandas:pandas.DataFrame`. Otherwise,
        return a list of rows produced by :func:`python:csv.reader`.
    job_name : str, optional
        Name for the job. If omitted, a random job name will be used.
    api_key : str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    credential_id : str or int, optional
        The database credential ID. If ``None``, the default credential
        will be used.
    polling_interval : int or float, optional
        Seconds to wait between successive checks for query completion.
    archive : bool, optional
        When ``True`` (the default), register a completion callback that
        archives the SQL script.
    **kwargs : kwargs
        Extra keyword arguments, forwarded to
        :func:`pandas:pandas.read_csv` when `use_pandas` is ``True`` and
        to :func:`python:csv.reader` when it is ``False``.

    Returns
    -------
    data : :class:`pandas:pandas.DataFrame` or list
        A `pandas` `DataFrame` if `use_pandas` is ``True``; otherwise a
        list of rows (with header as first row). In the list form no
        parsing of types is performed and each row is a list of strings.

    Raises
    ------
    ImportError
        If `use_pandas` is ``True`` and `pandas` is not installed.
    EmptyResultError
        If the finished run reports no output.

    Examples
    --------
    >>> sql = "SELECT * FROM schema.table"
    >>> df = read_civis_sql(sql, "my_database", use_pandas=True)
    >>> col_a = df["column_a"]

    >>> data = read_civis_sql(sql, "my_database")
    >>> columns = data.pop(0)
    >>> col_a_index = columns.index("column_a")
    >>> col_a = [row[col_a_index] for row in data]

    Notes
    -----
    This reads the data into memory.

    See Also
    --------
    civis.io.read_civis : Read directly into memory without SQL.
    civis.io.civis_to_csv : Write directly to a CSV file.
    """
    if use_pandas and NO_PANDAS:
        raise ImportError("use_pandas is True but pandas is not installed.")

    client = APIClient(api_key=api_key)
    script_id, run_id = _sql_script(client, sql, database, job_name,
                                    credential_id)
    future = PollableResult(client.scripts.get_sql_runs,
                            (script_id, run_id),
                            polling_interval)
    if archive:

        def _archive_script(_finished):
            # The callback argument is unused; only the script ID matters.
            return client.scripts.put_sql_archive(script_id, True)

        future.add_done_callback(_archive_script)

    # Block until the run is done, then fetch the run record for its output.
    future.result()
    run = client.scripts.get_sql_runs(script_id, run_id)
    outputs = run["output"]
    if not outputs:
        raise EmptyResultError(
            "Query {} returned no output.".format(script_id))

    # Only the first output entry is read.
    url = outputs[0]["path"]
    if use_pandas:
        return pd.read_csv(url, **kwargs)

    response = requests.get(url)
    response.raise_for_status()
    reader = csv.reader(StringIO(response.text), **kwargs)
    return list(reader)