def dataframe_to_parquet(dataframe, bq_schema, filepath, parquet_compression="SNAPPY"):
    """Write dataframe as a Parquet file, according to the desired BQ schema.

    This function requires the :mod:`pyarrow` package. Arrow is used as an
    intermediate format.

    Args:
        dataframe (pandas.DataFrame):
            DataFrame to convert to Parquet file.
        bq_schema (Sequence[Union[ \
            :class:`~google.cloud.bigquery.schema.SchemaField`, \
            Mapping[str, Any] \
        ]]):
            Desired BigQuery schema. Number of columns must match number of
            columns in the DataFrame.
        filepath (str):
            Path to write Parquet file to.
        parquet_compression (Optional[str]):
            The compression codec to use by the ``pyarrow.parquet.write_table``
            serializing method. Defaults to "SNAPPY".
            https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
    """
    if pyarrow is None:
        raise ValueError("pyarrow is required for BigQuery schema conversion.")

    bq_schema = schema._to_schema_fields(bq_schema)
    arrow_table = dataframe_to_arrow(dataframe, bq_schema)
    pyarrow.parquet.write_table(arrow_table, filepath, compression=parquet_compression)
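# Usage sketch (illustrative, not part of the library source). Shows how
# dataframe_to_parquet above might be called; the DataFrame contents and the
# output path are hypothetical. Plain mappings work in place of SchemaField
# because the function routes bq_schema through schema._to_schema_fields.
import pandas

df = pandas.DataFrame({"name": ["Alice", "Bob"], "age": [33, 29]})
bq_schema = [
    {"name": "name", "type": "STRING", "mode": "NULLABLE"},
    {"name": "age", "type": "INTEGER", "mode": "NULLABLE"},
]
dataframe_to_parquet(df, bq_schema, "/tmp/example.parquet")  # hypothetical path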
def dataframe_to_arrow(dataframe, bq_schema):
    """Convert pandas dataframe to Arrow table, using BigQuery schema.

    Args:
        dataframe (pandas.DataFrame):
            DataFrame to convert to Arrow table.
        bq_schema (Sequence[Union[ \
            :class:`~google.cloud.bigquery.schema.SchemaField`, \
            Mapping[str, Any] \
        ]]):
            Desired BigQuery schema. The number of columns must match the
            number of columns in the DataFrame.

    Returns:
        pyarrow.Table:
            Table containing dataframe data, with schema derived from
            BigQuery schema.
    """
    column_names = set(dataframe.columns)
    column_and_index_names = set(
        name for name, _ in list_columns_and_indexes(dataframe)
    )

    bq_schema = schema._to_schema_fields(bq_schema)
    bq_field_names = set(field.name for field in bq_schema)

    extra_fields = bq_field_names - column_and_index_names
    if extra_fields:
        raise ValueError(
            u"bq_schema contains fields not present in dataframe: {}".format(
                extra_fields
            )
        )

    # It's okay for indexes to be missing from bq_schema, but it's not okay to
    # be missing columns.
    missing_fields = column_names - bq_field_names
    if missing_fields:
        raise ValueError(
            u"bq_schema is missing fields from dataframe: {}".format(missing_fields)
        )

    arrow_arrays = []
    arrow_names = []
    arrow_fields = []
    for bq_field in bq_schema:
        arrow_fields.append(bq_to_arrow_field(bq_field))
        arrow_names.append(bq_field.name)
        arrow_arrays.append(
            bq_to_arrow_array(get_column_or_index(dataframe, bq_field.name), bq_field)
        )

    if all((field is not None for field in arrow_fields)):
        return pyarrow.Table.from_arrays(
            arrow_arrays, schema=pyarrow.schema(arrow_fields)
        )
    return pyarrow.Table.from_arrays(arrow_arrays, names=arrow_names)
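# Usage sketch (illustrative, not part of the library source). Demonstrates
# dataframe_to_arrow with explicit SchemaField objects; the resulting Arrow
# schema is derived from the BigQuery types rather than from pandas dtypes.
import pandas
from google.cloud.bigquery import SchemaField

df = pandas.DataFrame({"name": ["Alice", "Bob"], "age": [33, 29]})
arrow_table = dataframe_to_arrow(
    df, [SchemaField("name", "STRING"), SchemaField("age", "INTEGER")]
)
# arrow_table.schema carries Arrow types mapped from the BigQuery schema.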
def schema(self, value):
    if value is None:
        self._del_sub_prop("schema")
        return

    value = _to_schema_fields(value)

    _helpers._set_sub_prop(
        self._properties,
        ["load", "schema", "fields"],
        [field.to_api_repr() for field in value],
    )
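# Usage sketch (illustrative). The setter above stores the schema under the
# "load" properties, i.e. it backs LoadJobConfig.schema, so both SchemaField
# objects and plain mappings are accepted and normalized by _to_schema_fields
# before being written to the job's API representation.
from google.cloud import bigquery

job_config = bigquery.LoadJobConfig()
job_config.schema = [
    bigquery.SchemaField("full_name", "STRING", mode="REQUIRED"),
    {"name": "age", "type": "INTEGER", "mode": "NULLABLE"},
]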
def dataframe_to_parquet(
    dataframe,
    bq_schema,
    filepath,
    parquet_compression="SNAPPY",
    parquet_use_compliant_nested_type=True,
):
    """Write dataframe as a Parquet file, according to the desired BQ schema.

    This function requires the :mod:`pyarrow` package. Arrow is used as an
    intermediate format.

    Args:
        dataframe (pandas.DataFrame):
            DataFrame to convert to Parquet file.
        bq_schema (Sequence[Union[ \
            :class:`~google.cloud.bigquery.schema.SchemaField`, \
            Mapping[str, Any] \
        ]]):
            Desired BigQuery schema. Number of columns must match number of
            columns in the DataFrame.
        filepath (str):
            Path to write Parquet file to.
        parquet_compression (Optional[str]):
            The compression codec to use by the ``pyarrow.parquet.write_table``
            serializing method. Defaults to "SNAPPY".
            https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
        parquet_use_compliant_nested_type (bool):
            Whether the ``pyarrow.parquet.write_table`` serializing method
            should write compliant Parquet nested type (lists). Defaults to
            ``True``.
            https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#nested-types
            https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table

            This argument is ignored for ``pyarrow`` versions earlier than
            ``4.0.0``.
    """
    pyarrow = _helpers.PYARROW_VERSIONS.try_import(raise_if_error=True)

    import pyarrow.parquet

    kwargs = (
        {"use_compliant_nested_type": parquet_use_compliant_nested_type}
        if _helpers.PYARROW_VERSIONS.use_compliant_nested_type
        else {}
    )

    bq_schema = schema._to_schema_fields(bq_schema)
    arrow_table = dataframe_to_arrow(dataframe, bq_schema)
    pyarrow.parquet.write_table(
        arrow_table,
        filepath,
        compression=parquet_compression,
        **kwargs,
    )
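# Usage sketch (illustrative, not part of the library source). Shows the
# nested-type flag with a REPEATED column; the DataFrame contents and output
# path are hypothetical. On pyarrow versions earlier than 4.0.0 the flag is
# simply dropped from the write_table kwargs, as in the function above.
import pandas
from google.cloud.bigquery import SchemaField

df = pandas.DataFrame({"tags": [["a", "b"], ["c"]]})
dataframe_to_parquet(
    df,
    [SchemaField("tags", "STRING", mode="REPEATED")],
    "/tmp/tags.parquet",  # hypothetical path
    parquet_use_compliant_nested_type=False,  # keep the legacy list layout
)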
def download_arrow_row_iterator(pages, bq_schema):
    """Use HTTP JSON RowIterator to construct an iterable of RecordBatches.

    Args:
        pages (Iterator[:class:`google.api_core.page_iterator.Page`]):
            An iterator over the result pages.
        bq_schema (Sequence[Union[ \
            :class:`~google.cloud.bigquery.schema.SchemaField`, \
            Mapping[str, Any] \
        ]]):
            A description of the fields in result pages.

    Yields:
        :class:`pyarrow.RecordBatch`
        The next page of records as a ``pyarrow`` record batch.
    """
    bq_schema = schema._to_schema_fields(bq_schema)
    column_names = bq_to_arrow_schema(bq_schema) or [field.name for field in bq_schema]
    arrow_types = [bq_to_arrow_data_type(field) for field in bq_schema]

    for page in pages:
        yield _row_iterator_page_to_arrow(page, column_names, arrow_types)
def download_dataframe_row_iterator(pages, bq_schema, dtypes):
    """Use HTTP JSON RowIterator to construct a DataFrame.

    Args:
        pages (Iterator[:class:`google.api_core.page_iterator.Page`]):
            An iterator over the result pages.
        bq_schema (Sequence[Union[ \
            :class:`~google.cloud.bigquery.schema.SchemaField`, \
            Mapping[str, Any] \
        ]]):
            A description of the fields in result pages.
        dtypes (Mapping[str, numpy.dtype]):
            The types of columns in result data to hint construction of the
            resulting DataFrame. Not all column types have to be specified.

    Yields:
        :class:`pandas.DataFrame`
        The next page of records as a ``pandas.DataFrame`` record batch.
    """
    bq_schema = schema._to_schema_fields(bq_schema)
    column_names = [field.name for field in bq_schema]
    for page in pages:
        yield _row_iterator_page_to_dataframe(page, column_names, dtypes)
def _rows_from_json(values, schema):
    """Convert JSON row data to rows with appropriate types.

    Args:
        values (Sequence[Dict]): The list of responses (JSON rows) to convert.
        schema (Sequence[Union[ \
            :class:`~google.cloud.bigquery.schema.SchemaField`, \
            Mapping[str, Any] \
        ]]):
            The table's schema. If any item is a mapping, its content must be
            compatible with
            :meth:`~google.cloud.bigquery.schema.SchemaField.from_api_repr`.

    Returns:
        List[:class:`~google.cloud.bigquery.Row`]
    """
    from google.cloud.bigquery import Row
    from google.cloud.bigquery.schema import _to_schema_fields

    schema = _to_schema_fields(schema)
    field_to_index = _field_to_index_mapping(schema)
    return [Row(_row_tuple_from_json(r, schema), field_to_index) for r in values]
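# Usage sketch (illustrative, not part of the library source). The values
# mimic the REST response format for table rows, where each row is
# {"f": [{"v": ...}, ...]} with cells in schema order; the names and values
# below are hypothetical.
from google.cloud.bigquery import SchemaField

values = [
    {"f": [{"v": "Alice"}, {"v": "33"}]},
    {"f": [{"v": "Bob"}, {"v": "29"}]},
]
rows = _rows_from_json(
    values, [SchemaField("name", "STRING"), SchemaField("age", "INTEGER")]
)
# rows[0]["name"] == "Alice"; rows[0]["age"] is converted to the int 33.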
def _row_tuple_from_json(row, schema):
    """Convert JSON row data to row with appropriate types.

    Note: ``row['f']`` and ``schema`` are presumed to be of the same length.

    Args:
        row (Dict): A JSON response row to be converted.
        schema (Sequence[Union[ \
            :class:`~google.cloud.bigquery.schema.SchemaField`, \
            Mapping[str, Any] \
        ]]):
            Specification of the field types in ``row``.

    Returns:
        Tuple: A tuple of data converted to native types.
    """
    from google.cloud.bigquery.schema import _to_schema_fields

    schema = _to_schema_fields(schema)
    row_data = []
    for field, cell in zip(schema, row["f"]):
        row_data.append(_field_from_json(cell["v"], field))
    return tuple(row_data)
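# Usage sketch (illustrative). Mappings are accepted in place of SchemaField
# because _to_schema_fields converts them via SchemaField.from_api_repr; the
# field names and cell values below are hypothetical.
row = {"f": [{"v": "3.14"}, {"v": "true"}]}
converted = _row_tuple_from_json(
    row,
    [
        {"name": "pi", "type": "FLOAT", "mode": "NULLABLE"},
        {"name": "flag", "type": "BOOLEAN", "mode": "NULLABLE"},
    ],
)
# converted -> (3.14, True)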
def dataframe_to_bq_schema(dataframe, bq_schema):
    """Convert a pandas DataFrame schema to a BigQuery schema.

    Args:
        dataframe (pandas.DataFrame):
            DataFrame for which the client determines the BigQuery schema.
        bq_schema (Sequence[Union[ \
            :class:`~google.cloud.bigquery.schema.SchemaField`, \
            Mapping[str, Any] \
        ]]):
            A BigQuery schema. Use this argument to override the autodetected
            type for some or all of the DataFrame columns.

    Returns:
        Optional[Sequence[google.cloud.bigquery.schema.SchemaField]]:
            The automatically determined schema. Returns None if the type of
            any column cannot be determined.
    """
    if bq_schema:
        bq_schema = schema._to_schema_fields(bq_schema)
        bq_schema_index = {field.name: field for field in bq_schema}
        bq_schema_unused = set(bq_schema_index.keys())
    else:
        bq_schema_index = {}
        bq_schema_unused = set()

    bq_schema_out = []
    unknown_type_fields = []

    for column, dtype in list_columns_and_indexes(dataframe):
        # Use provided type from schema, if present.
        bq_field = bq_schema_index.get(column)
        if bq_field:
            bq_schema_out.append(bq_field)
            bq_schema_unused.discard(bq_field.name)
            continue

        # Otherwise, try to automatically determine the type based on the
        # pandas dtype.
        bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name)
        bq_field = schema.SchemaField(column, bq_type)
        bq_schema_out.append(bq_field)

        if bq_field.field_type is None:
            unknown_type_fields.append(bq_field)

    # Catch any schema mismatch. The developer explicitly asked to serialize a
    # column, but it was not found.
    if bq_schema_unused:
        raise ValueError(
            u"bq_schema contains fields not present in dataframe: {}".format(
                bq_schema_unused
            )
        )

    # If schema detection was not successful for all columns, also try with
    # pyarrow, if available.
    if unknown_type_fields:
        if not pyarrow:
            msg = u"Could not determine the type of columns: {}".format(
                ", ".join(field.name for field in unknown_type_fields)
            )
            warnings.warn(msg)
            return None  # We cannot detect the schema in full.

        # The augment_schema() helper itself will also issue unknown type
        # warnings if detection still fails for any of the fields.
        bq_schema_out = augment_schema(dataframe, bq_schema_out)

    return tuple(bq_schema_out) if bq_schema_out else None
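# Usage sketch (illustrative, not part of the library source). int64 and
# float64 columns map directly via _PANDAS_DTYPE_TO_BQ; the explicit field
# below overrides autodetection for "id" only. Column names are hypothetical.
import pandas
from google.cloud.bigquery import SchemaField

df = pandas.DataFrame({"id": [1, 2], "score": [0.5, 0.75]})
detected = dataframe_to_bq_schema(df, bq_schema=[SchemaField("id", "STRING")])
# detected is a tuple of SchemaField: "id" as STRING (override), "score" as FLOAT.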
def _call_fut(schema):
    from google.cloud.bigquery.schema import _to_schema_fields

    return _to_schema_fields(schema)