Example #1
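The excerpt below omits the module's imports. Based on the TFDV package layout (this function lives in tensorflow_data_validation's stats_gen_lib module), they are approximately the following; _generate_partial_statistics_from_df is a module-private helper that is not shown here:

import multiprocessing
from typing import List, cast

import numpy as np
from joblib import Parallel, delayed
from pandas import DataFrame

from tensorflow_data_validation.statistics import stats_impl
from tensorflow_data_validation.statistics import stats_options as options
from tensorflow_data_validation.statistics.generators import stats_generator
from tensorflow_metadata.proto.v0 import statistics_pb2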
def generate_statistics_from_dataframe(
        dataframe: DataFrame,
        stats_options: options.StatsOptions = options.StatsOptions(),
        n_jobs: int = 1) -> statistics_pb2.DatasetFeatureStatisticsList:
    """Compute data statistics for the input pandas DataFrame.

  This is a utility method for users with in-memory data represented
  as a pandas DataFrame.

  Args:
    dataframe: Input pandas DataFrame.
    stats_options: `tfdv.StatsOptions` for generating data statistics.
    n_jobs: Number of processes to run (defaults to 1). If -1 is provided,
      uses the same number of processes as the number of CPU cores.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
    if not isinstance(dataframe, DataFrame):
        raise TypeError('dataframe argument is of type {}. Must be a '
                        'pandas DataFrame.'.format(type(dataframe).__name__))

    stats_generators = cast(
        List[stats_generator.CombinerStatsGenerator],
        stats_impl.get_generators(stats_options, in_memory=True))
    if n_jobs < -1 or n_jobs == 0:
        raise ValueError('Invalid n_jobs parameter {}. Should be either '
                         '-1 or >= 1.'.format(n_jobs))

    if n_jobs == -1:
        n_jobs = multiprocessing.cpu_count()
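    # Clamp n_jobs to the valid range [1, cpu_count].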
    n_jobs = max(min(n_jobs, multiprocessing.cpu_count()), 1)

    if n_jobs == 1:
        merged_partial_stats = _generate_partial_statistics_from_df(
            dataframe, stats_options, stats_generators)
    else:
        # TODO(pachristopher): Investigate why we don't observe linear speedup after
        # a certain number of processes.
        splits = np.array_split(dataframe, n_jobs)
        partial_stats = Parallel(n_jobs=n_jobs)(
            delayed(_generate_partial_statistics_from_df)(
                splits[i], stats_options, stats_generators)
            for i in range(n_jobs))
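        # zip(*partial_stats) transposes the per-split results so each
        # generator sees the tuple of its own accumulators across splits.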
        merged_partial_stats = [
            gen.merge_accumulators(stats)
            for gen, stats in zip(stats_generators, zip(*partial_stats))
        ]
    return stats_impl.extract_statistics_output(merged_partial_stats,
                                                stats_generators)
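A minimal usage sketch, assuming the function is exposed through the public API as tfdv.generate_statistics_from_dataframe (the name TFDV documents); the toy DataFrame is purely illustrative:

import pandas as pd
import tensorflow_data_validation as tfdv

# A small in-memory DataFrame to compute statistics over.
df = pd.DataFrame({'age': [25, 32, 47], 'city': ['NYC', 'SF', 'LA']})

stats = tfdv.generate_statistics_from_dataframe(
    df, stats_options=tfdv.StatsOptions())

# The result is a DatasetFeatureStatisticsList proto; the per-feature
# statistics for the single dataset live under stats.datasets[0].features.
print(len(stats.datasets[0].features))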
Example #2
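This older revision of the same function relies on the same surrounding imports, except that pandas is imported wholesale (import pandas as pd) for the isinstance check, and the typing cast is dropped.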
def generate_statistics_from_dataframe(
    dataframe,
    stats_options = options.StatsOptions(),
    n_jobs = multiprocessing.cpu_count()
):
  """Compute data statistics for the input pandas DataFrame.

  This is a utility method for users with in-memory data represented
  as a pandas DataFrame.

  Args:
    dataframe: Input pandas DataFrame.
    stats_options: Options for generating data statistics.
    n_jobs: Number of processes to run.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
  if not isinstance(dataframe, pd.DataFrame):
    raise TypeError('dataframe argument is of type {}. Must be a '
                    'pandas DataFrame.'.format(type(dataframe).__name__))

  stats_generators = stats_impl.get_generators(stats_options, in_memory=True)
  n_jobs = max(min(n_jobs, multiprocessing.cpu_count()), 1)
  if n_jobs == 1:
    merged_partial_stats = _generate_partial_statistics_from_df(
        dataframe, stats_options, stats_generators)
  else:
    # TODO(pachristopher): Investigate why we don't observe linear speedup after
    # a certain number of processes.
    splits = np.array_split(dataframe, n_jobs)
    partial_stats = Parallel(n_jobs=n_jobs)(
        delayed(_generate_partial_statistics_from_df)(
            splits[i], stats_options, stats_generators) for i in range(n_jobs))
    merged_partial_stats = [
        gen.merge_accumulators(stats)
        for gen, stats in zip(stats_generators, zip(*partial_stats))
    ]
  return stats_impl.extract_statistics_output(
      merged_partial_stats, stats_generators)
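Both versions rely on the same combiner contract in the parallel branch: each split yields one accumulator per generator, zip(*partial_stats) regroups those accumulators per generator, and merge_accumulators folds each group into one. A standalone sketch of the regrouping step, with hypothetical string accumulators standing in for real TFDV ones:

# Two generators (g0, g1) over three DataFrame splits; partial_stats[i]
# holds one accumulator per generator for split i.
partial_stats = [
    ['g0_s0', 'g1_s0'],
    ['g0_s1', 'g1_s1'],
    ['g0_s2', 'g1_s2'],
]

# Transposing groups the accumulators by generator, ready for merging.
per_generator = list(zip(*partial_stats))
assert per_generator == [('g0_s0', 'g0_s1', 'g0_s2'),
                         ('g1_s0', 'g1_s1', 'g1_s2')]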