def row_to_sample(row, schema, feature_cols, label_cols):
    """Build a BigDL ``Sample`` from a single row.

    The row is first converted with ``convert_row_to_numpy``; the resulting
    array(s) are then handed to ``Sample.from_ndarray``.

    :param row: the row to convert; forwarded to ``convert_row_to_numpy``.
    :param schema: schema forwarded to ``convert_row_to_numpy``.
    :param feature_cols: feature column spec forwarded to
           ``convert_row_to_numpy``.
    :param label_cols: label column spec forwarded to
           ``convert_row_to_numpy``. When falsy (``None``, empty), the row is
           treated as unlabeled.
    :return: the object returned by ``Sample.from_ndarray``.
    """
    # Imported lazily, inside the function, exactly as the original does.
    from bigdl.util.common import Sample

    if not label_cols:
        # Unlabeled data: the conversion result is unpacked as a one-element
        # sequence, and a constant np.array([0.0]) fills the label position.
        only_feature, = convert_row_to_numpy(row, schema, feature_cols, label_cols)
        return Sample.from_ndarray(only_feature, np.array([0.0]))

    # Labeled data: the conversion result is unpacked as (feature, label).
    feature_part, label_part = convert_row_to_numpy(
        row, schema, feature_cols, label_cols)
    return Sample.from_ndarray(feature_part, label_part)
def _dataframe_to_xshards(data, feature_cols, label_cols=None):
    """Wrap a DataFrame's rows into a ``SparkXShards``.

    Every row of ``data.rdd`` is converted with ``convert_row_to_numpy``,
    then each partition of converted rows is passed to ``arrays2dict``; the
    resulting RDD is wrapped in ``SparkXShards``.

    NOTE(review): another definition of ``_dataframe_to_xshards`` appears
    alongside this one; if both live in the same module the later
    definition takes precedence -- confirm which one is intended.

    :param data: object exposing ``.schema`` and ``.rdd`` (presumably a Spark
           DataFrame -- TODO confirm).
    :param feature_cols: feature column spec forwarded to the helpers.
    :param label_cols: optional label column spec forwarded to the helpers.
    :return: a ``SparkXShards`` built from the transformed RDD.
    """
    # Read the schema once, outside the closures, so only the schema object
    # (not ``data``) is referenced by the per-row function.
    schema = data.schema

    def _row_to_arrays(row):
        return convert_row_to_numpy(row, schema, feature_cols, label_cols)

    def _partition_to_shard(rows):
        return arrays2dict(rows, feature_cols, label_cols)

    converted_rdd = data.rdd.map(_row_to_arrays)
    sharded_rdd = converted_rdd.mapPartitions(_partition_to_shard)
    return SparkXShards(sharded_rdd)
def _dataframe_to_xshards(data, feature_cols, label_cols=None):
    """Wrap a DataFrame's rows into a ``SparkXShards``, honoring the shard size.

    Every row of ``data.rdd`` is converted with ``convert_row_to_numpy``,
    then each partition of converted rows is passed to ``arrays2dict``
    together with ``OrcaContext._shard_size``; the resulting RDD is wrapped
    in ``SparkXShards``.

    :param data: object exposing ``.schema`` and ``.rdd`` (presumably a Spark
           DataFrame -- TODO confirm).
    :param feature_cols: feature column spec forwarded to the helpers.
    :param label_cols: optional label column spec forwarded to the helpers.
    :return: a ``SparkXShards`` built from the transformed RDD.
    """
    # Imported lazily, inside the function, exactly as the original does.
    from zoo.orca import OrcaContext

    # Both values are read once here, outside the closures, so the closures
    # carry plain values rather than ``data`` or ``OrcaContext``.
    schema = data.schema
    shard_size = OrcaContext._shard_size

    def _row_to_arrays(row):
        return convert_row_to_numpy(row, schema, feature_cols, label_cols)

    def _partition_to_shard(rows):
        return arrays2dict(rows, feature_cols, label_cols, shard_size)

    converted_rdd = data.rdd.map(_row_to_arrays)
    sharded_rdd = converted_rdd.mapPartitions(_partition_to_shard)
    return SparkXShards(sharded_rdd)