def _split_into_per_realization_tables(table: pa.Table) -> Dict[int, pa.Table]:
    """Split a multi-realization table into one sub-table per realization,
    keyed on the REAL value (the REAL column itself is dropped)."""
    per_real_tables: Dict[int, pa.Table] = {}
    unique_reals = table.column("REAL").unique().to_pylist()
    for real in unique_reals:
        # pylint: disable=no-member
        mask = pc.is_in(table["REAL"], value_set=pa.array([real]))
        real_table = table.filter(mask).drop(["REAL"])
        per_real_tables[real] = real_table

    return per_real_tables
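A minimal usage sketch for the splitter above (assuming import pyarrow as pa, import pyarrow.compute as pc and from typing import Dict are in scope for the function; the column names and values below are purely illustrative):

import pyarrow as pa

# Toy table with two realizations; VALUE is an arbitrary illustrative column.
toy = pa.table({"REAL": [0, 0, 1, 1], "VALUE": [1.0, 2.0, 3.0, 4.0]})
per_real = _split_into_per_realization_tables(toy)
print(sorted(per_real))           # [0, 1]
print(per_real[0].column_names)   # ['VALUE'] -> the REAL column has been dropped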
def render_arrow_v1(table: pa.Table, params, **kwargs):
    todo = frozenset(params["colnames"])

    for i, colname in enumerate(table.column_names):
        if colname not in todo:
            continue

        table = table.set_column(
            i, colname,
            format_chunked_array(table.column(i), table.schema.field(i)))

    return ArrowRenderResult(table)
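The pyarrow call at work in render_arrow_v1 is Table.set_column(i, name, column); format_chunked_array and ArrowRenderResult are external to this snippet. A minimal standalone illustration of set_column, using an assumed transformation (multiplying a column by 10):

import pyarrow as pa
import pyarrow.compute as pc

t = pa.table({"a": [1, 2, 3], "b": [4, 5, 6]})
# Replace column 0 with a transformed version, keeping its name and position.
t = t.set_column(0, "a", pc.multiply(t.column(0), 10))
print(t.column("a").to_pylist())  # [10, 20, 30]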
Example #3
def _filter_features(table: pa.Table,
                     feature_whitelist: List[types.FeatureName]) -> pa.Table:
    """Removes features that are not whitelisted.

  Args:
    table: Input Arrow table.
    feature_whitelist: A set of feature names to whitelist.

  Returns:
    An Arrow table containing only the whitelisted features of the input table.
  """
    column_names = set(table.schema.names)
    columns_to_select = []
    column_names_to_select = []
    for feature_name in feature_whitelist:
        if feature_name in column_names:
            columns_to_select.append(table.column(feature_name).data)
            column_names_to_select.append(feature_name)
    return pa.Table.from_arrays(columns_to_select, column_names_to_select)
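In newer pyarrow versions table.column() returns a ChunkedArray directly (there is no .data attribute), and the same whitelist filter can be expressed with Table.select. A sketch under that assumption; this is illustrative, not the TFDV implementation:

import pyarrow as pa
from typing import List

def filter_features_with_select(table: pa.Table,
                                feature_whitelist: List[str]) -> pa.Table:
    """Keep only whitelisted columns, in whitelist order (hypothetical helper)."""
    present = set(table.schema.names)
    return table.select([name for name in feature_whitelist if name in present])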
def get_weight_feature(input_table: pa.Table,
                       weight_column: Text) -> np.ndarray:
    """Gets the weight column from the input table.

  Args:
    input_table: Input table.
    weight_column: Name of the column containing the weight.

  Returns:
    A numpy array containing the weights of the examples in the input table.

  Raises:
    ValueError: If the weight feature is not present in the input table or is
        not a valid weight feature (must be of numeric type and have a
        single value for each example).
  """
    try:
        weights = input_table.column(weight_column).data.chunk(0)
    except KeyError:
        raise ValueError('Weight column "{}" not present in the input '
                         'table.'.format(weight_column))

    if pa.types.is_null(weights.type):
        raise ValueError(
            'Weight column "{}" cannot be null.'.format(weight_column))
    # Before flattening, check that there is a single value for each example.
    weight_lengths = array_util.ListLengthsFromListArray(weights).to_numpy()
    if not np.all(weight_lengths == 1):
        raise ValueError(
            'Weight column "{}" must have exactly one value in each example.'.
            format(weight_column))
    flat_weights = weights.flatten()
    # Before converting to numpy view, check the type (cannot convert string and
    # binary arrays to numpy view).
    flat_weights_type = flat_weights.type
    if (not pa.types.is_floating(flat_weights_type)
            and not pa.types.is_integer(flat_weights_type)):
        raise ValueError(
            'Weight column "{}" must be of numeric type. Found {}.'.format(
                weight_column, flat_weights_type))
    return np.asarray(flat_weights)
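The snippet above targets an older pyarrow API (Column.data, TFDV's array_util helpers). A hedged re-sketch of the same validation against current pyarrow, using pyarrow.compute list kernels; the helper name and details are illustrative, not the TFDV code:

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

def get_weight_feature_sketch(input_table: pa.Table, weight_column: str) -> np.ndarray:
    """Validate and extract a list-encoded, numeric weight column (hypothetical)."""
    if weight_column not in input_table.column_names:
        raise ValueError(f'Weight column "{weight_column}" not present in the input table.')
    weights = input_table.column(weight_column)  # ChunkedArray of list<...>
    if not (pa.types.is_list(weights.type) or pa.types.is_large_list(weights.type)):
        raise ValueError(f'Weight column "{weight_column}" must be a list column.')
    # Each example must carry exactly one weight value.
    if not pc.all(pc.equal(pc.list_value_length(weights), 1)).as_py():
        raise ValueError(f'Weight column "{weight_column}" must have exactly one value per example.')
    flat = pc.list_flatten(weights)
    if not (pa.types.is_floating(flat.type) or pa.types.is_integer(flat.type)):
        raise ValueError(f'Weight column "{weight_column}" must be numeric; found {flat.type}.')
    return flat.to_numpy()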
def find_intersected_dates_between_realizations(table: pa.Table) -> np.ndarray:
    """Find the intersection of dates present in all the realizations
    The input table must contain both REAL and DATE columns, but this function makes
    no assumptions about sorting of either column"""

    unique_reals = table.column("REAL").unique().to_numpy()

    date_intersection = None
    for real in unique_reals:
        # pylint: disable=no-member
        real_mask = pc.is_in(table["REAL"], value_set=pa.array([real]))
        dates_in_real = table.filter(real_mask).column(
            "DATE").unique().to_numpy()
        if date_intersection is None:
            date_intersection = dates_in_real
        else:
            date_intersection = np.intersect1d(date_intersection,
                                               dates_in_real,
                                               assume_unique=True)

    if date_intersection is not None:
        return date_intersection

    return np.empty(0, dtype=np.datetime64)
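A usage sketch with a toy two-realization table (numpy, pyarrow and pyarrow.compute must be imported as np, pa and pc for the function above; the dates are illustrative):

import numpy as np
import pyarrow as pa

dates_r0 = np.array(["2020-01-01", "2020-02-01", "2020-03-01"], dtype="datetime64[ms]")
dates_r1 = np.array(["2020-02-01", "2020-03-01", "2020-04-01"], dtype="datetime64[ms]")
toy = pa.table({
    "REAL": [0, 0, 0, 1, 1, 1],
    "DATE": pa.array(np.concatenate([dates_r0, dates_r1])),
})
# Only 2020-02-01 and 2020-03-01 occur in both realizations.
print(find_intersected_dates_between_realizations(toy))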
Example #6
    def _format_outputs(cls, table: pa.Table) -> pa.Table:

        for column_index in range(table.num_columns):

            column: pa.Array = table.column(column_index)
            format_applied = False

            if pa.types.is_floating(column.type):

                # Arrow outputs NaN as null
                # If a float column contains NaN, use our own formatter to distinguish between them

                has_nan = pac.any(
                    pac.and_not(  # noqa
                        column.is_null(nan_is_null=True),
                        column.is_null(nan_is_null=False)))

                if has_nan.as_py():
                    column = cls._format_float(column)
                    format_applied = True

            if pa.types.is_decimal(column.type):
                column = cls._format_decimal(column)
                format_applied = True

            if pa.types.is_timestamp(column.type):
                column = cls._format_timestamp(column)
                format_applied = True

            if format_applied:
                field = pa.field(table.schema.names[column_index], pa.utf8())
                table = table \
                    .remove_column(column_index) \
                    .add_column(column_index, field, column)

        return table
def add_brand(table: pa.Table) -> pa.Table:
    is_jp = table.column('app_id').to_pandas().str.endswith(
        'jyllands-posten.dk')
    brand = pd.Categorical(np.where(is_jp, 'jp', 'erhvervsmedier'))
    # noinspection PyCallByClass,PyTypeChecker
    return table.append_column(pa.Column.from_array('brand', brand))
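pa.Column.from_array only exists in very old pyarrow releases; with current pyarrow the new column can be appended by name. A sketch of an equivalent (this is not the original project's code):

import numpy as np
import pandas as pd
import pyarrow as pa

def add_brand_modern(table: pa.Table) -> pa.Table:
    is_jp = table.column('app_id').to_pandas().str.endswith('jyllands-posten.dk')
    brand = pd.Categorical(np.where(is_jp, 'jp', 'erhvervsmedier'))
    # pa.array() on a pandas Categorical yields a dictionary-encoded Arrow array.
    return table.append_column('brand', pa.array(brand))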
Example #8
def get_array(
        table: pa.Table, query_path: types.FeaturePath,
        return_example_indices: bool) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Retrieve a nested array (and optionally example indices) from a table.

  It assumes all the columns in `table` have only one chunk.
  It assumes `table` contains only arrays of the following supported types:
    - list<primitive>
    - list<struct<[Ts]>> where Ts are the types of the fields in the struct
      type, and they can only be one of the supported types
      (recursion intended).

  If the provided path refers to a leaf in the table, then a ListArray with a
  primitive element type will be returned. If the provided path does not refer
  to a leaf, a ListArray with a StructArray element type will be returned.

  Args:
    table: The Table whose arrays are to be visited. It is assumed that the table
      contains only one chunk.
    query_path: The FeaturePath to lookup in the table.
    return_example_indices: Whether to return an additional array containing the
      example indices of the elements in the array corresponding to the
      query_path.

  Returns:
    A tuple. The first term is the feature array and the second term is the
    example_indices array for the feature array (i.e. array[i] came from the
    example at row example_indices[i] in the table).

  Raises:
    KeyError: When the query_path is empty, or cannot be found in the table and
      its nested struct arrays.
  """
    def _recursion_helper(
        query_path: types.FeaturePath, array: pa.Array,
        example_indices: Optional[np.ndarray]
    ) -> Tuple[pa.Array, Optional[np.ndarray]]:
        """Recursion helper."""
        if not query_path:
            return array, example_indices
        array_type = array.type
        if (not is_list_like(array_type)
                or not pa.types.is_struct(array_type.value_type)):
            raise KeyError(
                'Cannot process query_path "{}" inside an array of type '
                '{}. Expecting a (large_)list<struct<...>>.'.format(
                    query_path, array_type))
        flat_struct_array = array.flatten()
        flat_indices = None
        if example_indices is not None:
            flat_indices = example_indices[
                array_util.GetFlattenedArrayParentIndices(array).to_numpy()]

        step = query_path.steps()[0]
        try:
            child_array = flat_struct_array.field(step)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(step))
        relative_path = types.FeaturePath(query_path.steps()[1:])
        return _recursion_helper(relative_path, child_array, flat_indices)

    if not query_path:
        raise KeyError('query_path must be non-empty.')
    column_name = query_path.steps()[0]
    try:
        array = table.column(column_name).data.chunk(0)
    except KeyError:
        raise KeyError(
            'query_path step 0 "{}" not in table.'.format(column_name))
    array_path = types.FeaturePath(query_path.steps()[1:])

    example_indices = np.arange(
        table.num_rows) if return_example_indices else None
    return _recursion_helper(array_path, array, example_indices)
Example #9
    def extract_column(self, pa_table: pa.Table) -> list:
        return pa_table.column(0).to_pylist()
Example #10
def sample_segmented_multi_real_table_at_date(
    table: pa.Table, np_datetime: np.datetime64
) -> pa.Table:
    """Sample table containing multiple realizations at the specified date.
    The table must contain both a REAL and a DATE column.
    The table must be segmented on REAL (so that all rows from a single
    realization are contiguous) and within each REAL segment, it must be
    sorted on DATE.
    """
    # pylint: disable=too-many-locals

    unique_reals_arr_np, first_occurrence_idx, real_counts = np.unique(
        table.column("REAL").to_numpy(), return_index=True, return_counts=True
    )

    all_dates_arr_np = table.column("DATE").to_numpy()

    # Will receive row indices into the full input table for the two values we should
    # interpolate/blend between.
    # To keep things simple we always add two indices for each realization even if
    # we know that no interpolation will be needed (e.g. exact matches)
    row_indices = []

    # Will receive the blending weights for doing interpolation
    interpolate_t_arr = np.zeros(len(unique_reals_arr_np))

    # Array with mask for selecting values when doing backfill. A value of 1 will select
    # v1, while a value of 0 will yield a 0 value
    backfill_mask_arr = np.ones(len(unique_reals_arr_np))

    for i, _real in enumerate(unique_reals_arr_np):
        # Starting row of this realization and number of rows belonging to realization
        start_row_idx = first_occurrence_idx[i]
        row_count = real_counts[i]

        # Get slice of the dates for just this realization
        dates_arr_np = all_dates_arr_np[start_row_idx : start_row_idx + row_count]
        assert len(dates_arr_np) > 0

        # OUTSIDE RANGE (query date is before our first date)
        if np_datetime < dates_arr_np[0]:
            row_indices.append(start_row_idx)
            row_indices.append(start_row_idx)
            # Extrapolate or just fill with 0 for rates
            # interpolate_t_arr[i] = 0
            backfill_mask_arr[i] = 0

        # OUTSIDE RANGE (query date is beyond our last date)
        elif np_datetime > dates_arr_np[-1]:
            row_indices.append(start_row_idx + row_count - 1)
            row_indices.append(start_row_idx + row_count - 1)
            # Extrapolate or just fill with 0 for rates. For interpolation, t should
            # really be 1, but since the rows are duplicated it does not matter
            # interpolate_t_arr[i] = 0
            backfill_mask_arr[i] = 0

        # EXACT MATCH on the LAST DATE
        elif np_datetime == dates_arr_np[-1]:
            row_indices.append(start_row_idx + row_count - 1)
            row_indices.append(start_row_idx + row_count - 1)
            # interpolate_t_arr[i] = 0
            # backfill_mask_arr[i] = 1

        else:
            # Search for query date amongst the realization's dates.
            # last_insertion_index is the last legal insertion index of the queried value
            last_insertion_index: int = np.searchsorted(
                dates_arr_np, np_datetime, side="right"
            ).item()

            assert 0 < last_insertion_index < len(dates_arr_np)
            assert dates_arr_np[last_insertion_index - 1] <= np_datetime
            assert dates_arr_np[last_insertion_index] > np_datetime

            if dates_arr_np[last_insertion_index - 1] == np_datetime:
                # Exact match
                row_indices.append(start_row_idx + last_insertion_index - 1)
                row_indices.append(start_row_idx + last_insertion_index - 1)
                # interpolate_t_arr[i] = 0
                # backfill_mask_arr[i] = 1
            else:
                row_indices.append(start_row_idx + last_insertion_index - 1)
                row_indices.append(start_row_idx + last_insertion_index)
                interpolate_t_arr[i] = _compute_interpolation_weight(
                    np_datetime,
                    dates_arr_np[last_insertion_index - 1],
                    dates_arr_np[last_insertion_index],
                )
                # backfill_mask_arr[i] = 1

    column_arrays = []

    for colname in table.schema.names:
        if colname == "REAL":
            column_arrays.append(unique_reals_arr_np)
        elif colname == "DATE":
            column_arrays.append(np.full(len(unique_reals_arr_np), np_datetime))
        else:
            records_np = table.column(colname).take(row_indices).to_numpy()
            if is_rate_from_field_meta(table.field(colname)):
                v1_arr = records_np[1::2]
                interpolated_vec_values = v1_arr * backfill_mask_arr
            else:
                v0_arr = records_np[0::2]
                v1_arr = records_np[1::2]
                delta_arr = v1_arr - v0_arr
                interpolated_vec_values = v0_arr + (delta_arr * interpolate_t_arr)

            column_arrays.append(pa.array(interpolated_vec_values))

    ret_table = pa.table(column_arrays, schema=table.schema)

    return ret_table
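_compute_interpolation_weight and is_rate_from_field_meta are helpers that are not shown in this snippet. Given how interpolate_t_arr is consumed above (v0 + (v1 - v0) * t), a hypothetical weight helper consistent with that usage could look like this (illustrative only):

import numpy as np

def _compute_interpolation_weight_sketch(
    d: np.datetime64, d0: np.datetime64, d1: np.datetime64
) -> float:
    """Hypothetical linear blend weight t in [0, 1] so that v = v0 + (v1 - v0) * t."""
    full_span_seconds = (d1 - d0) / np.timedelta64(1, "s")
    return float((d - d0) / np.timedelta64(1, "s")) / full_span_seconds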
Example #11
def resample_segmented_multi_real_table(table: pa.Table, freq: Frequency) -> pa.Table:
    """Resample table containing multiple realizations.
    The table must contain both a REAL and a DATE column.
    The table must be segmented on REAL (so that all rows from a single
    realization are contiguous) and within each REAL segment, it must be
    sorted on DATE.
    The segmentation is needed since interpolations must be done per realization
    and we utilize slicing on rows for speed.
    """
    # pylint: disable=too-many-locals

    real_arr_np = table.column("REAL").to_numpy()
    unique_reals, first_occurrence_idx, real_counts = np.unique(
        real_arr_np, return_index=True, return_counts=True
    )

    output_columns_dict: Dict[str, pa.ChunkedArray] = {}

    real_interpolation_info_dict: Dict[int, RealInterpolationInfo] = {}

    for colname in table.schema.names:
        if colname in ["DATE", "REAL"]:
            continue

        is_rate = is_rate_from_field_meta(table.field(colname))
        raw_whole_numpy_arr = table.column(colname).to_numpy()

        vec_arr_list = []
        for i, real in enumerate(unique_reals):
            start_row_idx = first_occurrence_idx[i]
            row_count = real_counts[i]

            rii = real_interpolation_info_dict.get(real)
            if not rii:
                rii = _extract_real_interpolation_info(
                    table, start_row_idx, row_count, freq
                )
                real_interpolation_info_dict[real] = rii

            raw_numpy_arr = raw_whole_numpy_arr[
                start_row_idx : start_row_idx + row_count
            ]

            if is_rate:
                inter = interpolate_backfill(
                    rii.sample_dates_np_as_uint,
                    rii.raw_dates_np_as_uint,
                    raw_numpy_arr,
                    0,
                    0,
                )
            else:
                inter = np.interp(
                    rii.sample_dates_np_as_uint,
                    rii.raw_dates_np_as_uint,
                    raw_numpy_arr,
                )

            arr_length = len(rii.sample_dates_np_as_uint)
            assert arr_length == len(inter)

            vec_arr_list.append(inter)

        output_columns_dict[colname] = pa.chunked_array(vec_arr_list)

    date_arr_list = []
    real_arr_list = []
    for real in unique_reals:
        rii = real_interpolation_info_dict[real]
        arr_length = len(rii.sample_dates_np)
        date_arr_list.append(rii.sample_dates_np)
        real_arr_list.append(np.full(arr_length, real))

    output_columns_dict["DATE"] = pa.chunked_array(date_arr_list)
    output_columns_dict["REAL"] = pa.chunked_array(real_arr_list)

    ret_table = pa.table(output_columns_dict, schema=table.schema)

    return ret_table
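interpolate_backfill, RealInterpolationInfo and _extract_real_interpolation_info are external helpers not reproduced here. Rate columns appear to be resampled with a step-wise backward fill (each sample date takes the next reported raw value), while non-rate columns go through ordinary linear interpolation via np.interp. A hypothetical stand-in for the backfill helper, for illustration only:

import numpy as np

def interpolate_backfill_sketch(x: np.ndarray, xp: np.ndarray, yp: np.ndarray,
                                yleft: float, yright: float) -> np.ndarray:
    """For each sample in x, take the yp value of the first raw point xp >= x.
    Samples outside the raw range get yleft / yright (the caller above passes 0)."""
    idx = np.searchsorted(xp, x, side="left")
    result = np.empty(len(x), dtype=np.float64)
    inside = (x >= xp[0]) & (x <= xp[-1])
    result[inside] = yp[np.minimum(idx[inside], len(xp) - 1)]
    result[x < xp[0]] = yleft
    result[x > xp[-1]] = yright
    return result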
Example #12
    def extract_column(self, pa_table: pa.Table) -> pa.Array:
        return pa_table.column(0)
def _is_date_column_monotonically_increasing(table: pa.Table) -> bool:
    dates_np = table.column("DATE").to_numpy()
    if not np.all(np.diff(dates_np) > np.timedelta64(0)):
        return False

    return True
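A quick check of the monotonicity helper on toy data (duplicate or decreasing dates return False, since the comparison is strict):

import numpy as np
import pyarrow as pa

dates = np.array(["2020-01-01", "2020-01-02", "2020-01-03"], dtype="datetime64[ms]")
increasing = pa.table({"DATE": pa.array(dates)})
decreasing = pa.table({"DATE": pa.array(dates[::-1].copy())})
print(_is_date_column_monotonically_increasing(increasing))  # True
print(_is_date_column_monotonically_increasing(decreasing))  # False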
Example #14
def _postprocess_name_columns(
        table: pyarrow.Table,
        has_header: bool) -> Tuple[pyarrow.Table, List[ParseCsvWarning]]:
    """
    Return `table`, with final column names but still String values.
    """
    warnings = []
    if has_header and table.num_rows > 0:
        n_ascii_cleaned = 0
        first_ascii_cleaned = None
        n_truncated = 0
        first_truncated = None
        n_numbered = 0
        first_numbered = None

        names = []
        for colname in gen_unique_clean_colnames(
                list(("" if c[0] is pyarrow.NULL else c[0].as_py())
                     for c in table.columns),
                settings=settings,
        ):
            names.append(colname.name)
            if colname.is_ascii_cleaned:
                if n_ascii_cleaned == 0:
                    first_ascii_cleaned = colname.name
                n_ascii_cleaned += 1
            if colname.is_truncated:
                if n_truncated == 0:
                    first_truncated = colname.name
                n_truncated += 1
            if colname.is_numbered:
                if n_numbered == 0:
                    first_numbered = colname.name
                n_numbered += 1
            # Unicode can't be fixed, because we assume valid UTF-8 input
            assert not colname.is_unicode_fixed
            # Stay silent if colname.is_default. Users expect us to
            # auto-generate default column names.

        if n_ascii_cleaned:
            warnings.append(
                ParseCsvWarning.CleanedAsciiColumnNames(
                    n_ascii_cleaned, first_ascii_cleaned))
        if n_truncated:
            warnings.append(
                ParseCsvWarning.TruncatedColumnNames(n_truncated,
                                                     first_truncated))
        if n_numbered:
            warnings.append(
                ParseCsvWarning.NumberedColumnNames(n_numbered,
                                                    first_numbered))

        # Remove header (zero-copy: builds new pa.Table with same backing data)
        table = table.slice(1)
    else:
        names = [f"Column {i + 1}" for i in range(len(table.columns))]

    return (
        pyarrow.table({name: table.column(i)
                       for i, name in enumerate(names)}),
        warnings,
    )
def _is_date_column_sorted(table: pa.Table) -> bool:
    dates_np = table.column("DATE").to_numpy()
    if not np.all(np.diff(dates_np) > np.timedelta64(0)):
        return False

    return True
Example #16
    def conform_to_schema(
            cls, table: pa.Table, schema: pa.Schema,
            pandas_types=None, warn_extra_columns=True) \
            -> pa.Table:
        """
        Align an Arrow table to an Arrow schema.

        Columns will be matched using case-insensitive matching and columns not in the schema will be dropped.
        The resulting table will have the field order and case defined in the schema.

        Where column types do not match exactly, type coercion will be applied if possible.
        In some cases type coercion may result in overflows,
        for example casting int64 -> int32 will fail if any values are greater than the maximum int32 value.

        If the incoming data has been converted from Pandas, there are some conversions that can be applied
        if the original Pandas dtype is known. These dtypes can be supplied via the pandas_types parameter
        and should line up with the data in the table (i.e. dtypes are for the source data, not the target schema).

        The method will return a dataset whose schema exactly matches the requested schema.
        If it is not possible to make the data conform to the schema for any reason, EDataConformance will be raised.

        :param table: The data to be conformed
        :param schema: The schema to conform to
        :param pandas_types: Pandas dtypes for the table, if the table has been converted from Pandas
        :param warn_extra_columns: Whether to log warnings if the table contains columns not in the schema
        :return: The conformed data, whose schema will exactly match the supplied schema parameter
        :raises: _ex.EDataConformance if conformance is not possible for any reason
        """

        # If Pandas types are supplied they must match the table, i.e. table has been converted from Pandas
        if pandas_types is not None and len(pandas_types) != len(
                table.schema.types):
            raise _ex.EUnexpected()

        cls._check_duplicate_fields(schema, True)
        cls._check_duplicate_fields(table.schema, False)

        table_indices = {
            f.lower(): i
            for (i, f) in enumerate(table.schema.names)
        }
        conformed_data = []
        conformance_errors = []

        # Coerce types to match expected schema where possible
        for schema_index in range(len(schema.names)):

            try:
                schema_field = schema.field(schema_index)
                table_index = table_indices.get(schema_field.name.lower())

                if table_index is None:
                    message = cls.__E_FIELD_MISSING.format(
                        field_name=schema_field.name)
                    cls.__log.error(message)
                    raise _ex.EDataConformance(message)

                table_column: pa.Array = table.column(table_index)

                pandas_type = pandas_types[table_index] \
                    if pandas_types is not None \
                    else None

                if table_column.type == schema_field.type:
                    conformed_column = table_column
                else:
                    conformed_column = cls._coerce_vector(
                        table_column, schema_field, pandas_type)

                if not schema_field.nullable and table_column.null_count > 0:
                    message = f"Null values present in non-null field [{schema_field.name}]"
                    cls.__log.error(message)
                    raise _ex.EDataConformance(message)

                conformed_data.append(conformed_column)

            except _ex.EDataConformance as e:
                conformance_errors.append(e)

        # Columns not defined in the schema will not be included in the conformed output
        if warn_extra_columns and table.num_columns > len(schema.types):

            schema_columns = set(map(str.lower, schema.names))
            extra_columns = [
                f"[{col}]" for col in table.schema.names
                if col.lower() not in schema_columns
            ]

            message = f"Columns not defined in the schema will be dropped: {', '.join(extra_columns)}"
            cls.__log.warning(message)

        if any(conformance_errors):
            if len(conformance_errors) == 1:
                raise conformance_errors[0]
            else:
                cls.__log.error("There were multiple data conformance errors")
                raise _ex.EDataConformance(
                    "There were multiple data conformance errors",
                    conformance_errors)

        return pa.Table.from_arrays(conformed_data, schema=schema)  # noqa
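conform_to_schema relies on class-internal helpers (_coerce_vector, _check_duplicate_fields) and the _ex error types, which are not shown here. For the simple case where column names and order already line up, plain pyarrow offers Table.cast, which also performs safe, overflow-checked type coercion; an illustrative sketch:

import pyarrow as pa

source = pa.table({"id": pa.array([1, 2, 3], type=pa.int64()),
                   "score": pa.array([0.5, 1.5, 2.5], type=pa.float64())})
target_schema = pa.schema([("id", pa.int32()), ("score", pa.float32())])
# Raises pa.ArrowInvalid if a value would overflow the narrower target type.
print(source.cast(target_schema))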