コード例 #1
0
ファイル: dataframe.py プロジェクト: zarathomas/tamr-toolbox
def from_dataset(
    dataset: Dataset,
    *,
    columns: Optional[List[str]] = None,
    flatten_delimiter: Optional[str] = None,
    flatten_columns: Optional[List[str]] = None,
    force_flatten: bool = False,
    nrows: Optional[int] = None,
    allow_dataset_refresh: bool = False,
) -> "pandas.DataFrame":
    """
    Creates a DataFrame from a Tamr Dataset

    Args:
        dataset: Tamr Dataset object
        columns: optional, ordered list of columns to keep
        flatten_delimiter: if set, flatten list types to strings by concatenating with this
            delimiter
        flatten_columns: optional, list of columns to flatten. Only consulted for type checks
            when `flatten_delimiter` is set. The list passed in is never modified.
        force_flatten:  if False, arrays with inner types other than string will not be flattened.
            if True, will force all inner types to strings when flattening values.
        nrows: number of rows to read. default None will read all rows
        allow_dataset_refresh: if True, allows running a job to refresh dataset to make streamable

    Returns:
        DataFrame

    Raises:
        ValueError: if `columns` or `flatten_columns` contain columns that are not present in
            `dataset`
        RuntimeError: if `dataset` is not streamable and `allow_dataset_refresh` is False
    """
    # This function requires pandas, an optional dependency
    import pandas

    LOGGER.info(
        f"Streaming records to DataFrame for dataset {dataset.name} (id={dataset.resource_id})."
    )
    dataset_attrs = [attr for attr in dataset.attributes]
    attr_names = [attr.name for attr in dataset_attrs]
    # check that specified columns exist
    if columns is not None:
        common._check_columns_subset(
            input_list=columns, reference_list=attr_names, raise_error=True
        )
    # checks on columns to flatten
    if flatten_delimiter is not None:
        if flatten_columns is None:
            flatten_columns = list(attr_names)
        else:
            # check that specified columns exist
            common._check_columns_subset(
                input_list=flatten_columns, reference_list=attr_names, raise_error=True
            )
            # Work on a private copy: columns may be dropped below and the caller's
            # list must not change as a side effect of this call
            flatten_columns = list(flatten_columns)

        # check types of flatten_columns
        requested = set(flatten_columns)
        not_flattenable = set()
        for attr in dataset_attrs:
            if attr.name not in requested:
                continue
            attr_type = attr.spec().to_dict()["type"]
            if attr_type["baseType"] == "ARRAY" and attr_type["innerType"]["baseType"] != "STRING":
                if force_flatten:
                    LOGGER.info(
                        f"Will force attribute to string: {attr.name}, with type: {attr_type}"
                    )
                else:
                    LOGGER.warning(
                        f"Will not flatten attribute: {attr.name}, with type: {attr_type}"
                    )
                    not_flattenable.add(attr.name)
        if not_flattenable:
            # Drop every occurrence (not just the first) of each skipped column
            flatten_columns = [
                name for name in flatten_columns if name not in not_flattenable
            ]

    if not dataset.status().is_streamable:
        if allow_dataset_refresh:
            LOGGER.info(f"Refreshing dataset {dataset.name} to make streamable.")
            op = dataset.refresh()
            operation.enforce_success(op)
        else:
            message = (
                f"Dataset {dataset.name} is not streamable. Refresh it first, or run"
                f" with allow_dataset_refresh=True"
            )
            LOGGER.error(message)
            raise RuntimeError(message)

    # if flattening, apply _flatten_list (bound to the delimiter and force flag) to records;
    # otherwise pass func=None through to _yield_records
    func = None
    if flatten_delimiter is not None:
        func = partial(common._flatten_list, delimiter=flatten_delimiter, force=force_flatten)
    df = pandas.DataFrame.from_records(
        common._yield_records(
            dataset, func=func, columns=columns, flatten_columns=flatten_columns
        ),
        columns=columns,
        nrows=nrows,
    )
    return df
コード例 #2
0
def from_dataset(
    dataset: Dataset,
    export_file_path: Union[Path, str],
    *,
    csv_delimiter: str = ",",
    columns: Optional[List[str]] = None,
    flatten_delimiter: str = "|",
    quote_character: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    na_value: str = "NaN",
    nrows: Optional[int] = None,
    allow_dataset_refresh: bool = False,
    buffer_size: int = 10000,
    overwrite: bool = False,
) -> int:
    """
    Export a Tamr Dataset to a csv file. Records are streamed to disk and written according to a
    given buffer size. As a result this is more memory efficient than first reading to a
    pandas.DataFrame and writing to CSV.

    Args:
        dataset: Tamr Dataset object
        export_file_path: Path to the csv file where the dataset will be saved
        csv_delimiter: Delimiter of the csv file
        columns: Optional, Ordered list of columns to write. If None, write all columns in
            arbitrary order.
        flatten_delimiter: Flatten list types to strings by concatenating with this delimiter
        quote_character: Character used to escape value for csv delimiter when it appears in the
            value.
        quoting: The escape strategy to use according to the Python csv writer.
            See https://docs.python.org/3/library/csv.html#csv.QUOTE_MINIMAL
        na_value: Value to write that represents empty or missing data.
            See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
            for the na_values supported by default in pandas.read_csv
        nrows: Optional, Number of rows to write. If None, then write all rows. If zero or
            negative, no records are written and the file contains only the header row.
        allow_dataset_refresh: If True, allows running a job to refresh dataset to make streamable.
            Otherwise a RuntimeError will be thrown if the dataset is unstreamable.
        buffer_size: Number of records to store in memory before writing to disk
        overwrite: if True and export_file_path already exists, overwrite the file.
            Otherwise throw an error

    Returns:
        The total number of records written

    Raises:
        FileExistsError: if the csv file to which the dataset is to be streamed exists
            and `overwrite` is False
        RuntimeError: if `dataset` is not streamable and `allow_dataset_refresh` is False
        ValueError: if `columns` contains columns that are not present in `dataset`, or if
            `csv_delimiter` and `flatten_delimiter` are identical
    """
    LOGGER.info(
        f"Streaming records to csv file {export_file_path} from dataset {dataset.name} "
        f"(id={dataset.resource_id}).")

    if os.path.exists(export_file_path):
        if not overwrite:
            message = (
                f"CSV file {export_file_path} already exists. "
                f"(Set 'overwrite' flag to True if you wish to overwrite)")
            LOGGER.error(message)
            raise FileExistsError(message)
        else:
            LOGGER.warning(
                f"CSV file {export_file_path} already exists and will be overwritten"
            )

    # Flattened list values are joined with flatten_delimiter; sharing it with the csv
    # delimiter is rejected up front
    if csv_delimiter == flatten_delimiter:
        message = (
            f"The CSV delimiter '{csv_delimiter}' cannot be identical to the list "
            f"flattening delimiter '{flatten_delimiter}'")
        LOGGER.error(message)
        raise ValueError(message)

    attribute_names = [attr.name for attr in dataset.attributes]

    # check that specified columns exist
    if columns is not None:
        common._check_columns_subset(input_list=columns,
                                     reference_list=attribute_names,
                                     raise_error=True)

    if not dataset.status().is_streamable:
        if allow_dataset_refresh:
            LOGGER.info(
                f"Refreshing dataset {dataset.name} to make streamable.")
            op = dataset.refresh()
            operation.enforce_success(op)
        else:
            message = (
                f"Dataset {dataset.name} is not streamable. Refresh it first, or "
                f"run with allow_dataset_refresh=True")
            LOGGER.error(message)
            raise RuntimeError(message)

    func = partial(common._flatten_list,
                   delimiter=flatten_delimiter,
                   force=True)

    # A non-positive row limit means "write no records". Without this guard the
    # row-limit check inside the loop would only trigger after the first record had
    # already been buffered, so one unwanted row would be written.
    if nrows is not None and nrows <= 0:
        records = iter(())
    else:
        records = common._yield_records(dataset, func=func, columns=columns)

    # Open CSV file and use newline='' as recommended by
    # https://docs.python.org/3/library/csv.html#csv.writer
    with open(export_file_path, "w", newline="") as csv_file:
        csv_writer = csv.writer(
            csv_file,
            delimiter=csv_delimiter,
            quotechar=quote_character,
            quoting=quoting,
        )
        buffer = []
        header = None
        # Set record number to -1 in case the dataset streamed has no records
        record_number = -1

        for record_number, record in enumerate(records):
            # Obtain and write the header information only on the first pass
            if header is None:
                header = record.keys() if columns is None else columns
                csv_writer.writerow(header)

            # Replace empty values with a specific null value
            # This also allows nulls to be treated differently from empty strings
            record = [
                na_value if record[k] is None else record[k] for k in header
            ]
            buffer.append(record)

            at_max_buffer = buffer_size is not None and (len(buffer) >=
                                                         buffer_size)
            at_max_rows = nrows is not None and record_number >= nrows - 1
            if at_max_buffer or at_max_rows:
                csv_writer.writerows(buffer)
                LOGGER.debug(
                    f"Written dataset {dataset.name} up to record {record_number+1}"
                )
                buffer = []
                if at_max_rows:
                    break

        # Write anything remaining
        # This will occur whenever the buffer is non-zero and the number of records
        # is not exactly divisible by the buffer number
        # For example, writing a dataset with 1100 records using a buffer size of 500
        # will write in 3 chunks: 2 x 500 above and the remaining 100 handled here
        if len(buffer) != 0:
            LOGGER.debug(
                f"Written dataset {dataset.name} up to record {record_number + 1}"
            )
            csv_writer.writerows(buffer)

        if record_number == -1:
            # If record number is -1 then no records were streamed, either because the dataset
            # has no records or because nrows was not positive. We therefore want to simply
            # save the headers
            if columns is not None:
                csv_writer.writerow(columns)
            else:
                csv_writer.writerow(attribute_names)

    records_written = record_number + 1

    LOGGER.info(
        f"Wrote {records_written} from dataset {dataset.name} (id={dataset.resource_id}) "
        f"to {export_file_path}")

    return records_written