def transform(self, dataset: Dataset) -> Dataset:
    """Transforms the dataset by applying the graph of operators to it. Requires the ``fit``
    method to have already been called, or calculated statistics to be loaded from disk

    This method returns a Dataset object, with the transformations lazily loaded. None
    of the actual computation will happen until the produced Dataset is consumed, or
    written out to disk.

    Parameters
    ----------
    dataset: Dataset

    Returns
    -------
    Dataset
    """
    self._clear_worker_cache()

    if not self.output_schema:
        self.fit_schema(dataset.schema)

    ddf = dataset.to_ddf(columns=self._input_columns())
    return Dataset(
        _transform_ddf(ddf, self.output_node, self.output_dtypes),
        client=self.client,
        cpu=dataset.cpu,
        base_dataset=dataset.base_dataset,
        schema=self.output_schema,
    )
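# Usage sketch (not part of the source above): how fit/transform compose and
# where computation actually happens. This assumes the public NVTabular API
# (nvt.Workflow, nvt.Dataset, nvt.ops) and uses placeholder column names and
# file paths for illustration only.
import nvtabular as nvt
from nvtabular import ops

cat_features = ["name-cat", "name-string"] >> ops.Categorify()
cont_features = ["x", "y"] >> ops.Normalize()
workflow = nvt.Workflow(cat_features + cont_features)

train_ds = nvt.Dataset("train.parquet")
valid_ds = nvt.Dataset("valid.parquet")

workflow.fit(train_ds)                       # statistics are computed here
transformed = workflow.transform(valid_ds)   # lazy: returns a Dataset, no work yet
transformed.to_parquet("valid_out")          # computation happens on write-out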
def test_dataloader_epochs(datasets, engine, batch_size, epochs, on_ddf):
    dataset = Dataset(str(datasets["parquet"]), engine=engine)

    if on_ddf:
        dataset = dataset.to_ddf()

    cont_names = ["x", "y", "id"]
    cat_names = ["name-string", "name-cat"]
    label_name = ["label"]

    data_loader = DataLoader(
        dataset,
        cat_names=cat_names,
        cont_names=cont_names,
        batch_size=batch_size,
        label_names=label_name,
        shuffle=False,
    )

    # Convert to iterators and then to DataFrames
    df1 = _concat(list(data_loader._buff.itr))
    df2 = _concat(list(data_loader.epochs(epochs)._buff.itr))

    # Check that the DataFrame sizes and rows make sense
    assert len(df2) == epochs * len(df1)
    assert_eq(
        _concat([df1 for i in range(epochs)]).reset_index(drop=True),
        df2.reset_index(drop=True),
    )
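# Sketch of the property checked above, in plain pandas and independent of the
# data loader internals (the frames here are made up for illustration):
# running `epochs` passes should yield the one-epoch result repeated that many
# times, both in length and row content.
import pandas as pd
from pandas.testing import assert_frame_equal

df1 = pd.DataFrame({"x": [0.1, 0.2], "label": [0, 1]})  # one epoch worth of rows
epochs = 3
df2 = pd.concat([df1] * epochs)  # what iterating .epochs(3) is expected to produce

assert len(df2) == epochs * len(df1)
assert_frame_equal(
    pd.concat([df1 for _ in range(epochs)]).reset_index(drop=True),
    df2.reset_index(drop=True),
)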
def fit(self, dataset: Dataset):
    """Calculates statistics for this workflow on the input dataset

    Parameters
    ----------
    dataset: Dataset
        The input dataset to calculate statistics for. If there is a train/test split this
        data should be the training dataset only.
    """
    self._clear_worker_cache()
    ddf = dataset.to_ddf(columns=self._input_columns())

    # Get a dictionary mapping all StatOperators we need to fit to a set of any dependent
    # StatOperators (having StatOperators that depend on the output of other StatOperators
    # means we will have multiple phases in the fit cycle here)
    stat_ops = {op: _get_stat_ops(op.parents) for op in _get_stat_ops([self.column_group])}

    while stat_ops:
        # get all the StatOperators that we can currently call fit on (no outstanding
        # dependencies)
        current_phase = [op for op, dependencies in stat_ops.items() if not dependencies]
        if not current_phase:
            # this shouldn't happen, but let's not infinite loop just in case
            raise RuntimeError("failed to find dependency-free StatOperator to fit")

        stats, ops = [], []
        for column_group in current_phase:
            # apply transforms necessary for the inputs to the current column group,
            # ignoring the transforms from the statop itself
            transformed_ddf = _transform_ddf(ddf, column_group.parents)

            op = column_group.op
            try:
                stats.append(op.fit(column_group.input_column_names, transformed_ddf))
                ops.append(op)
            except Exception:
                LOG.exception("Failed to fit operator %s", column_group.op)
                raise

        if self.client:
            results = [r.result() for r in self.client.compute(stats)]
        else:
            results = dask.compute(stats, scheduler="synchronous")[0]

        for computed_stats, op in zip(results, ops):
            op.fit_finalize(computed_stats)

        # Remove all the operators we processed in this phase, and remove
        # from the dependencies of other ops too
        for stat_op in current_phase:
            stat_ops.pop(stat_op)
        for dependencies in stat_ops.values():
            dependencies.difference_update(current_phase)
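# Generic sketch of the phase-based fitting loop above, using a plain dict of
# made-up operator names (not NVTabular internals): each op maps to the set of
# ops it depends on; ops with no outstanding dependencies are fitted together
# in one phase, then removed from the remaining dependency sets.
stat_ops = {
    "categorify": set(),
    "normalize": set(),
    "join_groupby": {"categorify"},  # depends on categorify's statistics
}

phases = []
while stat_ops:
    current_phase = [op for op, deps in stat_ops.items() if not deps]
    if not current_phase:
        raise RuntimeError("cyclic dependency between stat operators")
    phases.append(current_phase)
    for op in current_phase:
        stat_ops.pop(op)
    for deps in stat_ops.values():
        deps.difference_update(current_phase)

print(phases)  # [['categorify', 'normalize'], ['join_groupby']]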
def transform(self, dataset: Dataset) -> Dataset:
    """Transforms the dataset by applying the graph of operators to it. Requires the 'fit'
    method to have already been called, or calculated statistics to be loaded from disk

    This method returns a Dataset object, with the transformations lazily loaded. None
    of the actual computation will happen until the produced Dataset is consumed, or
    written out to disk.

    Parameters
    ----------
    dataset: Dataset

    Returns
    -------
    Dataset
    """
    self._clear_worker_cache()
    ddf = dataset.to_ddf(columns=self._input_columns())
    return Dataset(_transform_ddf(ddf, self.column_group), client=self.client)
def fit(self, dataset: Dataset) -> "Workflow":
    """Calculates statistics for this workflow on the input dataset

    Parameters
    ----------
    dataset: Dataset
        The input dataset to calculate statistics for. If there is a train/test split this
        data should be the training dataset only.
    """
    self._clear_worker_cache()

    if not self.output_schema:
        self.fit_schema(dataset.schema)

    ddf = dataset.to_ddf(columns=self._input_columns())

    # Get a dictionary mapping all StatOperators we need to fit to a set of any dependent
    # StatOperators (having StatOperators that depend on the output of other StatOperators
    # means we will have multiple phases in the fit cycle here)
    stat_ops = {
        op: _get_stat_ops(op.parents_with_dependencies)
        for op in _get_stat_ops([self.output_node])
    }

    while stat_ops:
        # get all the StatOperators that we can currently call fit on (no outstanding
        # dependencies)
        current_phase = [op for op, dependencies in stat_ops.items() if not dependencies]
        if not current_phase:
            # this shouldn't happen, but let's not infinite loop just in case
            raise RuntimeError("failed to find dependency-free StatOperator to fit")

        stats, ops = [], []
        for workflow_node in current_phase:
            # Check for additional input columns that aren't generated by parents
            addl_input_cols = set()
            if workflow_node.parents:
                upstream_output_cols = sum(
                    [
                        upstream.output_columns
                        for upstream in workflow_node.parents_with_dependencies
                    ],
                    nvtabular.ColumnSelector(),
                )
                addl_input_cols = set(workflow_node.input_columns.names) - set(
                    upstream_output_cols.names
                )

            # apply transforms necessary for the inputs to the current column group,
            # ignoring the transforms from the statop itself
            transformed_ddf = _ensure_optimize_dataframe_graph(
                ddf=_transform_ddf(
                    ddf,
                    workflow_node.parents_with_dependencies,
                    additional_columns=addl_input_cols,
                )
            )

            op = workflow_node.op
            try:
                stats.append(op.fit(workflow_node.input_columns, transformed_ddf))
                ops.append(op)
            except Exception:
                LOG.exception("Failed to fit operator %s", workflow_node.op)
                raise

        if self.client:
            results = [r.result() for r in self.client.compute(stats)]
        else:
            results = dask.compute(stats, scheduler="synchronous")[0]

        for computed_stats, op in zip(results, ops):
            op.fit_finalize(computed_stats)

        # Remove all the operators we processed in this phase, and remove
        # from the dependencies of other ops too
        for stat_op in current_phase:
            stat_ops.pop(stat_op)
        for dependencies in stat_ops.values():
            dependencies.difference_update(current_phase)

    # hack: store input/output dtypes here. We should have complete dtype
    # information for each operator (like we do for column names), but as
    # an interim solution this gets us what we need.
    input_dtypes = dataset.to_ddf()[self._input_columns()].dtypes
    self.input_dtypes = dict(zip(input_dtypes.index, input_dtypes))
    output_dtypes = self.transform(dataset).sample_dtypes()
    self.output_dtypes = dict(zip(output_dtypes.index, output_dtypes))

    self._zero_output_schemas()
    self.fit_schema(dataset.schema)
    return self
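# Because fit returns the Workflow itself, fitting and transforming can be
# chained. A minimal sketch assuming the public NVTabular API (nvt.Workflow,
# nvt.Dataset, nvt.ops.Categorify); paths and column names are placeholders.
import nvtabular as nvt
from nvtabular import ops

workflow = nvt.Workflow(["name-cat", "name-string"] >> ops.Categorify())

train_ds = nvt.Dataset("train.parquet")
valid_ds = nvt.Dataset("valid.parquet")

transformed = workflow.fit(train_ds).transform(valid_ds)

# The dtypes and schema captured at the end of fit() are then available:
print(workflow.output_dtypes)
print(workflow.output_schema)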