def split_fast(self,
               index_series,
               proportion,
               batch_size,
               base_batch_size=1000):
    """Deterministically split a `DataFrame` into two materialized `DataFrame`s.

    Note this split is only as deterministic as the underlying hash function;
    see `tf.string_to_hash_bucket_fast`.  The hash function is deterministic
    for a given binary, but may change occasionally.  The only way to achieve
    an absolute guarantee that the split `DataFrame`s do not change across runs
    is to materialize them, which this method does.

    Note too that the allocation of a row to one partition or the
    other is evaluated independently for each row, so the exact number of rows
    in each partition is binomially distributed.

    Args:
      index_series: a `Series` of unique strings, whose hash will determine the
        partitioning; or the name in this `DataFrame` of such a `Series`.
        (This `Series` must contain strings because TensorFlow provides hash
        ops only for strings, and there are no number-to-string converter ops.)
      proportion: The proportion of the rows to select for the 'left'
        partition; the remaining (1 - proportion) rows form the 'right'
        partition.
      batch_size: the batch size to use when rebatching the left and right
        `DataFrame`s after the split.  (Unlike `split`, this argument is
        required and the results are always rebatched, since both partitions
        are materialized to memory.)
      base_batch_size: the batch size to use for materialized data, prior to
        the split.

    Returns:
      Two `DataFrame`s containing the partitioned rows, materialized to
      memory.
    """
    if isinstance(index_series, str):
        index_series = self[index_series]
    left_mask, = split_mask.SplitMask(proportion)(index_series)
    right_mask = ~left_mask
    # Attach the masks as temporary columns so they are materialized exactly
    # once, alongside the data, rather than recomputing the hashes per side.
    self["left_mask__"] = left_mask
    self["right_mask__"] = right_mask
    # TODO(soergel): instead of base_batch_size can we just do one big batch?
    # avoid computing the hashes twice
    m = self.materialize_to_memory(batch_size=base_batch_size)
    left_rows_df = m.select_rows(m["left_mask__"])
    right_rows_df = m.select_rows(m["right_mask__"])
    del left_rows_df[["left_mask__", "right_mask__"]]
    del right_rows_df[["left_mask__", "right_mask__"]]
    # Also drop the temporary mask columns from self, so this method has no
    # lasting side effect on the caller's DataFrame.  (Safe because `m` was
    # already materialized above and no longer depends on self's columns —
    # confirm `materialize_to_memory` is eager if this DataFrame impl changes.)
    del self[["left_mask__", "right_mask__"]]

    # avoid recomputing the split repeatedly
    left_rows_df = left_rows_df.materialize_to_memory(
        batch_size=batch_size)
    right_rows_df = right_rows_df.materialize_to_memory(
        batch_size=batch_size)
    return left_rows_df, right_rows_df
# Example 2
  def split(self, index_series, proportion, batch_size=None):
    """Deterministically split a `DataFrame` into two `DataFrame`s.

    Note this split is only as deterministic as the underlying hash function;
    see `tf.string_to_hash_bucket_fast`.  The hash function is deterministic
    for a given binary, but may change occasionally.  The only way to achieve
    an absolute guarantee that the split `DataFrame`s do not change across runs
    is to materialize them.

    Note too that the allocation of a row to one partition or the
    other is evaluated independently for each row, so the exact number of rows
    in each partition is binomially distributed.

    Args:
      index_series: a `Series` of unique strings, whose hash will determine the
        partitioning; or the name in this `DataFrame` of such a `Series`.
        (This `Series` must contain strings because TensorFlow provides hash
        ops only for strings, and there are no number-to-string converter ops.)
      proportion: The proportion of the rows to select for the 'left'
        partition; the remaining (1 - proportion) rows form the 'right'
        partition.
      batch_size: the batch size to use when rebatching the left and right
        `DataFrame`s.  If None (default), the `DataFrame`s are not rebatched;
        thus their batches will have variable sizes, according to which rows
        are selected from each batch of the original `DataFrame`.

    Returns:
      Two `DataFrame`s containing the partitioned rows.
    """
    if isinstance(index_series, str):
      index_series = self[index_series]
    # The hash-based mask selects the 'left' rows; its complement is 'right'.
    keep_mask, = split_mask.SplitMask(proportion)(index_series)
    partitions = [self.select_rows(keep_mask),
                  self.select_rows(~keep_mask)]

    if batch_size:
      partitions = [part.batch(batch_size=batch_size, shuffle=False)
                    for part in partitions]

    return partitions[0], partitions[1]