def from_dataset(
    dataset: Dataset,
    *,
    columns: Optional[List[str]] = None,
    flatten_delimiter: Optional[str] = None,
    flatten_columns: Optional[List[str]] = None,
    force_flatten: bool = False,
    nrows: Optional[int] = None,
    allow_dataset_refresh: bool = False,
) -> "pandas.DataFrame":
    """
    Creates a DataFrame from a Tamr Dataset

    Args:
        dataset: Tamr Dataset object
        columns: optional, ordered list of columns to keep
        flatten_delimiter: if set, flatten list types to strings by concatenating with this
            delimiter
        flatten_columns: optional, list of columns to flatten
        force_flatten: if False, arrays with inner types other than string will not be
            flattened. If True, will force all inner types to strings when flattening values.
        nrows: number of rows to read. default None will read all rows
        allow_dataset_refresh: if True, allows running a job to refresh dataset to make
            streamable

    Returns:
        DataFrame

    Raises:
        RuntimeError: if `dataset` is not streamable and `allow_dataset_refresh` is False
        ValueError: if `columns` or `flatten_columns` contain columns that are not present
            in `dataset`
    """
    # This function requires pandas, an optional dependency
    import pandas

    LOGGER.info(
        f"Streaming records to DataFrame for dataset {dataset.name} (id={dataset.resource_id})."
    )
    dataset_attrs = [attr for attr in dataset.attributes]
    attr_names = [attr.name for attr in dataset_attrs]

    # check that specified columns exist
    if columns is not None:
        common._check_columns_subset(
            input_list=columns, reference_list=attr_names, raise_error=True
        )

    # checks on columns to flatten
    if flatten_delimiter is not None:
        if flatten_columns is None:
            flatten_columns = list(attr_names)
        else:
            # check that specified columns exist
            common._check_columns_subset(
                input_list=flatten_columns, reference_list=attr_names, raise_error=True
            )
        # check types of flatten_columns
        for attr in dataset_attrs:
            if attr.name not in flatten_columns:
                continue
            attr_type = attr.spec().to_dict()["type"]
            if attr_type["baseType"] == "ARRAY" and attr_type["innerType"]["baseType"] != "STRING":
                if force_flatten:
                    LOGGER.info(
                        f"Will force attribute to string: {attr.name}, with type: {attr_type}"
                    )
                else:
                    LOGGER.warning(
                        f"Will not flatten attribute: {attr.name}, with type: {attr_type}"
                    )
                    flatten_columns.remove(attr.name)

    if not dataset.status().is_streamable:
        if allow_dataset_refresh:
            LOGGER.info(f"Refreshing dataset {dataset.name} to make streamable.")
            op = dataset.refresh()
            operation.enforce_success(op)
        else:
            message = (
                f"Dataset {dataset.name} is not streamable. Refresh it first, or run"
                f" with allow_dataset_refresh=True"
            )
            LOGGER.error(message)
            raise RuntimeError(message)

    # if flattening, apply _flatten_list to each record;
    # otherwise leave func as None so records pass through unchanged
    func = None
    if flatten_delimiter is not None:
        func = partial(common._flatten_list, delimiter=flatten_delimiter, force=force_flatten)

    df = pandas.DataFrame.from_records(
        common._yield_records(
            dataset, func=func, columns=columns, flatten_columns=flatten_columns
        ),
        columns=columns,
        nrows=nrows,
    )
    return df
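
# Usage sketch (illustrative, not part of the library): assumes `tamr` is an
# authenticated tamr-unify-client Client and that a dataset named "my_dataset"
# exists on the instance; the dataset name and column names are assumptions.
#
#     dataset = tamr.datasets.by_name("my_dataset")
#     df = from_dataset(
#         dataset,
#         columns=["tamr_id", "first_name", "last_name"],
#         flatten_delimiter="|",
#         allow_dataset_refresh=True,
#     )
#     print(df.head())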
def from_dataset(
    dataset: Dataset,
    export_file_path: Union[Path, str],
    *,
    csv_delimiter: str = ",",
    columns: Optional[List[str]] = None,
    flatten_delimiter: str = "|",
    quote_character: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    na_value: str = "NaN",
    nrows: Optional[int] = None,
    allow_dataset_refresh: bool = False,
    buffer_size: int = 10000,
    overwrite: bool = False,
) -> int:
    """
    Export a Tamr Dataset to a csv file. Records are streamed to disk and written according to
    a given buffer size. As a result this is more memory efficient than first reading to a
    pandas.DataFrame and writing to CSV.

    Args:
        dataset: Tamr Dataset object
        export_file_path: Path to the csv file where the dataset will be saved
        csv_delimiter: Delimiter of the csv file
        columns: Optional, Ordered list of columns to write. If None, write all columns in
            arbitrary order.
        flatten_delimiter: Flatten list types to strings by concatenating with this delimiter
        quote_character: Character used to escape value for csv delimiter when it appears in
            the value.
        quoting: The escape strategy to use according to the Python csv writer.
            See https://docs.python.org/3/library/csv.html#csv.QUOTE_MINIMAL
        na_value: Value to write that represents empty or missing data.
            See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
            for the na_values supported by default in pandas.read_csv
        nrows: Optional, Number of rows to write. If None, then write all rows.
        allow_dataset_refresh: If True, allows running a job to refresh dataset to make
            streamable. Otherwise a RuntimeError will be raised if the dataset is not streamable.
        buffer_size: Number of records to store in memory before writing to disk
        overwrite: if True and export_file_path already exists, overwrite the file.
            Otherwise raise an error

    Returns:
        The total number of records written

    Raises:
        FileExistsError: if the csv file to which the dataset is to be streamed exists
            and `overwrite` is False
        RuntimeError: if `dataset` is not streamable and `allow_dataset_refresh` is False
        ValueError: if `columns` contains columns that are not present in `dataset`, or if
            `csv_delimiter` and `flatten_delimiter` are identical
    """
    LOGGER.info(
        f"Streaming records to csv file {export_file_path} from dataset {dataset.name} "
        f"(id={dataset.resource_id})."
    )

    if os.path.exists(export_file_path):
        if not overwrite:
            message = (
                f"CSV file {export_file_path} already exists. "
                f"(Set 'overwrite' flag to True if you wish to overwrite)"
            )
            LOGGER.error(message)
            raise FileExistsError(message)
        else:
            LOGGER.warning(f"CSV file {export_file_path} already exists and will be overwritten")

    if csv_delimiter == flatten_delimiter:
        message = (
            f"The CSV delimiter '{csv_delimiter}' cannot be identical to the list "
            f"flattening delimiter '{flatten_delimiter}'"
        )
        LOGGER.error(message)
        raise ValueError(message)

    attribute_names = [attr.name for attr in dataset.attributes]

    # check that specified columns exist
    if columns is not None:
        common._check_columns_subset(
            input_list=columns, reference_list=attribute_names, raise_error=True
        )

    if not dataset.status().is_streamable:
        if allow_dataset_refresh:
            LOGGER.info(f"Refreshing dataset {dataset.name} to make streamable.")
            op = dataset.refresh()
            operation.enforce_success(op)
        else:
            message = (
                f"Dataset {dataset.name} is not streamable. Refresh it first, or "
                f"run with allow_dataset_refresh=True"
            )
            LOGGER.error(message)
            raise RuntimeError(message)

    func = partial(common._flatten_list, delimiter=flatten_delimiter, force=True)

    # Open CSV file and use newline='' as recommended by
    # https://docs.python.org/3/library/csv.html#csv.writer
    with open(export_file_path, "w", newline="") as csv_file:
        csv_writer = csv.writer(
            csv_file, delimiter=csv_delimiter, quotechar=quote_character, quoting=quoting,
        )

        buffer = []
        header = None
        # Set record number to -1 in case the dataset streamed has no records
        record_number = -1
        for record_number, record in enumerate(
            common._yield_records(dataset, func=func, columns=columns)
        ):
            # Obtain and write the header information only on the first pass
            if header is None:
                header = record.keys() if columns is None else columns
                csv_writer.writerow(header)

            # Replace empty values with a specific null value
            # This also allows nulls to be treated differently from empty strings
            record = [na_value if record[k] is None else record[k] for k in header]
            buffer.append(record)

            at_max_buffer = buffer_size is not None and (len(buffer) >= buffer_size)
            at_max_rows = nrows is not None and record_number >= nrows - 1
            if at_max_buffer or at_max_rows:
                csv_writer.writerows(buffer)
                LOGGER.debug(f"Written dataset {dataset.name} up to record {record_number + 1}")
                buffer = []
                if at_max_rows:
                    break

        # Write anything remaining
        # This will occur whenever the buffer is non-zero and the number of records
        # is not exactly divisible by the buffer number
        # For example, writing a dataset with 1100 records using a buffer size of 500
        # will write in 3 chunks: 2 x 500 above and the remaining 100 handled here
        if len(buffer) != 0:
            LOGGER.debug(f"Written dataset {dataset.name} up to record {record_number + 1}")
            csv_writer.writerows(buffer)

        if record_number == -1:
            # If record number is -1 then no records were streamed, possibly because the dataset
            # has no records. We therefore want to simply save the headers
            if columns is not None:
                csv_writer.writerow(columns)
            else:
                csv_writer.writerow(attribute_names)

    records_written = record_number + 1

    LOGGER.info(
        f"Wrote {records_written} records from dataset {dataset.name} "
        f"(id={dataset.resource_id}) to {export_file_path}"
    )
    return records_written
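
# Usage sketch (illustrative, not part of the library): assumes `dataset` is a
# Tamr Dataset object obtained elsewhere; the output path, delimiters, and row
# limit below are assumptions chosen for the example.
#
#     n_written = from_dataset(
#         dataset,
#         "/tmp/my_dataset.csv",
#         csv_delimiter=",",
#         flatten_delimiter="|",
#         nrows=1000,
#         allow_dataset_refresh=True,
#         overwrite=True,
#     )
#     print(f"Wrote {n_written} records")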