def test_transform_with_repartition(self):
    # shards of pandas dataframe
    file_path = os.path.join(self.resource_path, "orca/data/csv")
    data_shard = bigdl.orca.data.pandas.read_csv(file_path)
    partitions = data_shard.rdd.glom().collect()
    for par in partitions:
        assert len(par) <= 1

    def negative(df, column_name):
        df[column_name] = df[column_name] * (-1)
        return df

    shard2 = data_shard.transform_shard(negative, "sale_price")

    shard3 = shard2.repartition(4)
    partitions3 = shard3.rdd.glom().collect()
    for par in partitions3:
        assert len(par) <= 1

    shard4 = shard2.repartition(1)
    partitions4 = shard4.rdd.glom().collect()
    for par in partitions4:
        assert len(par) <= 1

    shard5 = shard4.transform_shard(negative, "sale_price")
    partitions5 = shard5.rdd.glom().collect()
    for par in partitions5:
        assert len(par) <= 1

    # shards of list
    data = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]
    sc = init_nncontext()
    rdd = sc.parallelize(data)
    data_shard = SparkXShards(rdd)

    shard2 = data_shard.repartition(6)
    partitions2 = shard2.rdd.glom().collect()
    for par in partitions2:
        assert len(par) <= 1

    shard3 = data_shard.repartition(1)
    partitions2 = shard3.rdd.glom().collect()
    for par in partitions2:
        assert len(par) <= 1

    # shards of numpy array
    data = [np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
            np.array([9, 10, 11, 12]), np.array([13, 14, 15, 16])]
    sc = init_nncontext()
    rdd = sc.parallelize(data)
    data_shard = SparkXShards(rdd)

    shard2 = data_shard.repartition(6)
    partitions2 = shard2.rdd.glom().collect()
    for par in partitions2:
        assert len(par) <= 1

    shard3 = data_shard.repartition(1)
    partitions2 = shard3.rdd.glom().collect()
    for par in partitions2:
        assert len(par) <= 1

def test_xshards_predict_save_load(self):
    sc = init_nncontext()
    rdd = sc.range(0, 110).map(lambda x: np.array([x] * 50))
    shards = rdd.mapPartitions(lambda iter: chunks(iter, 5)).map(
        lambda x: {"x": np.stack(x)})
    shards = SparkXShards(shards)

    estimator = get_estimator(workers_per_node=2,
                              model_fn=lambda config: IdentityNet())
    result_shards = estimator.predict(shards, batch_size=4)
    result_before = np.concatenate([shard["prediction"]
                                    for shard in result_shards.collect()])
    expected_result = np.concatenate([shard["x"]
                                      for shard in result_shards.collect()])
    assert np.array_equal(result_before, expected_result)

    path = "/tmp/model.pth"
    try:
        estimator.save(path)
        estimator.load(path)
        result_shards = estimator.predict(shards, batch_size=4)
        result_after = np.concatenate([shard["prediction"]
                                       for shard in result_shards.collect()])
    finally:
        os.remove(path)

    assert np.array_equal(result_before, result_after)

def get_pred_xshards(key):
    rdd = self.sc.range(0, 110).map(lambda x: np.array([x] * 50))
    shards = rdd.mapPartitions(lambda iter: chunks(iter, 5)) \
        .map(lambda x: {key: np.stack(x)}) \
        .map(lambda x: {key: [x[key][:, :24], x[key][:, 24:]]})
    shards = SparkXShards(shards)
    return shards

def test_spark_xshards(self):
    from bigdl.dllib.nncontext import init_nncontext
    from bigdl.orca.data import SparkXShards
    estimator = get_estimator(workers_per_node=1)
    sc = init_nncontext()
    x_rdd = sc.parallelize(np.random.rand(4000, 1, 50).astype(np.float32))
    # torch 1.7.1+ requires target size same as output size, which is (batch, 1)
    y_rdd = sc.parallelize(np.random.randint(0, 2, size=(4000, 1, 1)).astype(np.float32))
    rdd = x_rdd.zip(y_rdd).map(lambda x_y: {'x': x_y[0], 'y': x_y[1]})
    train_rdd, val_rdd = rdd.randomSplit([0.9, 0.1])
    train_xshards = SparkXShards(train_rdd)
    val_xshards = SparkXShards(val_rdd)
    train_stats = estimator.fit(train_xshards, batch_size=256, epochs=2)
    print(train_stats)
    val_stats = estimator.evaluate(val_xshards, batch_size=128)
    print(val_stats)

def update_predict_xshards(xshard, pred_xshards):
    def updates(d1_d2):
        d1, d2 = d1_d2
        d1.update(d2)
        return d1

    result = SparkXShards(xshard.rdd.zip(pred_xshards.rdd).map(updates))
    return result

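# Hedged usage sketch (added for illustration, not from the original source): assuming an
# Orca context is already initialized, update_predict_xshards merges each prediction dict
# into the matching feature dict via rdd.zip, so both XShards must have the same number of
# partitions and the same number of elements per partition. The tiny in-memory data below
# is illustrative only.
def _example_update_predict_xshards():
    sc = init_nncontext()
    feature_rdd = sc.parallelize([{"x": np.ones((4, 2))}], numSlices=1)
    pred_rdd = sc.parallelize([{"prediction": np.zeros((4, 1))}], numSlices=1)
    merged = update_predict_xshards(SparkXShards(feature_rdd), SparkXShards(pred_rdd))
    # each shard now carries both the original feature and its prediction
    assert set(merged.collect()[0].keys()) == {"x", "prediction"}
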
def _predict_spark_xshards(self, xshards, init_params, params):
    def transform_func(iter, init_param, param):
        partition_data = list(iter)
        # res = combine_in_partition(partition_data)
        param["data_creator"] = make_data_creator(partition_data)
        return PytorchPysparkWorker(**init_param).predict(**params)

    pred_shards = SparkXShards(xshards.rdd.mapPartitions(
        lambda iter: transform_func(iter, init_params, params)))
    return pred_shards

def test_zip(self):
    def negative(df, column_name, minus_val):
        df[column_name] = df[column_name] * (-1)
        df[column_name] = df[column_name] - minus_val
        return df

    file_path = os.path.join(self.resource_path, "orca/data/json")
    data_shard = bigdl.orca.data.pandas.read_json(file_path,
                                                  orient='columns',
                                                  lines=True)
    data_shard = data_shard.repartition(2)
    data_shard.cache()
    transformed_shard = data_shard.transform_shard(negative, "value", 2)
    zipped_shard = data_shard.zip(transformed_shard)
    assert not transformed_shard.is_cached(), "transformed_shard should be uncached."
    data = zipped_shard.collect()
    assert data[0][0]["value"].values[0] + data[0][1]["value"].values[0] == -2, \
        "value should be -2"

    list1 = list([1, 2, 3])
    with self.assertRaises(Exception) as context:
        data_shard.zip(list1)
    self.assertTrue('other should be a SparkXShards' in str(context.exception))

    transformed_shard = transformed_shard.repartition(data_shard.num_partitions() - 1)
    with self.assertRaises(Exception) as context:
        data_shard.zip(transformed_shard)
    self.assertTrue('The two SparkXShards should have the same number of partitions'
                    in str(context.exception))

    dict_data = [{"x": 1, "y": 2}, {"x": 2, "y": 3}]
    sc = init_nncontext()
    rdd = sc.parallelize(dict_data)
    dict_shard = SparkXShards(rdd)
    dict_shard = dict_shard.repartition(1)
    with self.assertRaises(Exception) as context:
        transformed_shard.zip(dict_shard)
    self.assertTrue('The two SparkXShards should have the same number of elements in '
                    'each partition' in str(context.exception))

def test_openvino_predict_xshards(self):
    self.load_resnet()
    input_data_list = [np.array([self.input] * 4),
                       np.concatenate([np.array([self.input] * 2),
                                       np.zeros([1, 3, 224, 224])])]
    sc = init_nncontext()
    rdd = sc.parallelize(input_data_list, numSlices=2)
    shards = SparkXShards(rdd)

    def pre_processing(images):
        return {"x": images}

    shards = shards.transform_shard(pre_processing)
    result = self.est.predict(shards)
    result_c = result.collect()
    assert isinstance(result, SparkXShards)
    assert result_c[0]["prediction"].shape == (4, 1000)
    assert result_c[1]["prediction"].shape == (3, 1000)
    assert self.check_result(result_c[0]["prediction"], 4)
    assert self.check_result(result_c[1]["prediction"], 2)
    assert not self.check_result(result_c[1]["prediction"][2:], 1)

def test_convert_predict_rdd_to_xshard(self):
    rdd = self.sc.range(0, 110).map(lambda x: np.array([x] * 50))
    shards = rdd.mapPartitions(lambda iter: chunks(iter, 5)).map(
        lambda x: {"x": np.stack(x)})
    shards = SparkXShards(shards)
    pred_rdd = self.sc.range(0, 110).map(lambda x: np.array([x] * 50))
    result_shards = convert_predict_rdd_to_xshard(shards, pred_rdd)
    result = np.concatenate([shard["prediction"]
                             for shard in result_shards.collect()])
    expected_result = np.concatenate([shard["x"]
                                      for shard in result_shards.collect()])
    assert np.array_equal(result, expected_result)

def _dataframe_to_xshards(data, feature_cols, label_cols=None, accept_str_col=False):
    from bigdl.orca import OrcaContext
    schema = data.schema
    shard_size = OrcaContext._shard_size
    numpy_rdd = data.rdd.map(lambda row: convert_row_to_numpy(row,
                                                              schema,
                                                              feature_cols,
                                                              label_cols,
                                                              accept_str_col))
    shard_rdd = numpy_rdd.mapPartitions(lambda x: arrays2dict(x,
                                                              feature_cols,
                                                              label_cols,
                                                              shard_size))
    return SparkXShards(shard_rdd)

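# Hedged usage sketch (added for illustration, not from the original source): assuming a
# SparkSession obtained through OrcaContext and that the helper above is importable in the
# current module, each resulting shard is expected to be a dict holding "x" (features) and,
# when label_cols is given, "y" (labels) as numpy arrays. The column names are made up.
def _example_dataframe_to_xshards():
    from bigdl.orca import OrcaContext
    spark = OrcaContext.get_spark_session()
    df = spark.createDataFrame([(1.0, 0.0), (2.0, 1.0)], ["feature", "label"])
    shards = _dataframe_to_xshards(df, feature_cols=["feature"], label_cols=["label"])
    first = shards.rdd.first()
    assert "x" in first and "y" in first
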
def test_xshards_predict(self):
    sc = init_nncontext()
    rdd = sc.range(0, 110).map(lambda x: np.array([x] * 50))
    shards = rdd.mapPartitions(lambda iter: chunks(iter, 5)).map(
        lambda x: {"x": np.stack(x)})
    shards = SparkXShards(shards)

    estimator = get_estimator(workers_per_node=2,
                              model_fn=lambda config: IdentityNet())
    result_shards = estimator.predict(shards, batch_size=4)
    result = np.concatenate([shard["prediction"]
                             for shard in result_shards.collect()])
    expected_result = np.concatenate([shard["x"]
                                      for shard in result_shards.collect()])
    assert np.array_equal(result, expected_result)

def convert_predict_rdd_to_xshard(data, prediction_rdd):
    import numpy as np
    from bigdl.orca.data import SparkXShards

    def group_index(iter):
        for data in iter:
            size = get_size(data["x"])
            for i in range(size):
                yield size

    def transform_predict(predictions):
        # list of np array
        if isinstance(predictions[0], list):
            predictions = np.array(predictions).T.tolist()
            result = [np.array(predict) for predict in predictions]
            return result
        # np array
        else:
            return np.array(predictions)

    def group(iter):
        this_index = 0
        buffer = []
        this_count = None
        for (count, pred) in iter:
            if this_index == 0:
                this_count = count
            if this_index < this_count:
                buffer.append(pred)
                this_index += 1
                if this_index == this_count:
                    yield transform_predict(buffer)
                    buffer.clear()
                    this_index = 0

    def add_pred(shard_pred):
        shard, pred = shard_pred
        shard["prediction"] = pred
        return shard

    indexed_rdd = data.rdd.mapPartitions(group_index)
    grouped_pred = indexed_rdd.zip(prediction_rdd).mapPartitions(group)
    result_rdd = data.rdd.zip(grouped_pred).map(add_pred)
    return SparkXShards(result_rdd)

def to_spark_xshards(self):
    from bigdl.orca.data import SparkXShards
    ray_ctx = RayContext.get()
    sc = ray_ctx.sc
    address = ray_ctx.redis_address
    password = ray_ctx.redis_password
    num_parts = self.num_partitions()
    partition2store = self.partition2store_name
    rdd = self.rdd.mapPartitionsWithIndex(
        lambda idx, _: get_from_ray(idx, address, password, partition2store))

    # the reason why we trigger computation here is to ensure we get the data
    # from ray before the RayXShards goes out of scope and the data get garbage collected
    from pyspark.storagelevel import StorageLevel
    rdd = rdd.cache()
    result_rdd = rdd.map(lambda x: x)  # sparkxshards will uncache the rdd when gc
    spark_xshards = SparkXShards(result_rdd)
    return spark_xshards

def _read_as_xshards(path):
    rdd, schema = ParquetDataset._read_as_dict_rdd(path)

    def merge_records(schema, iter):
        l = list(iter)
        result = {}
        for k in schema.keys():
            result[k] = []
        for i, rec in enumerate(l):
            for k in schema.keys():
                result[k].append(rec[k])
        for k, v in schema.items():
            if not v.feature_type == FeatureType.IMAGE:
                result[k] = np.stack(result[k])
        return [result]

    result_rdd = rdd.mapPartitions(lambda iter: merge_records(schema, iter))
    xshards = SparkXShards(result_rdd)
    return xshards

def read_parquet(file_path, columns=None, schema=None, **options):
    """
    Read parquet files into a SparkXShards of pandas DataFrames.

    :param file_path: Parquet file path, a list of multiple parquet file paths, or a
           directory containing parquet files. Local file system, HDFS, and AWS S3 are
           supported.
    :param columns: list of column names, default=None.
           If not None, only these columns will be read from the file.
    :param schema: pyspark.sql.types.StructType for the input schema or a DDL-formatted
           string (for example, col0 INT, col1 DOUBLE).
    :param options: other options for reading parquet.
    :return: An instance of SparkXShards.
    """
    sc = init_nncontext()
    spark = OrcaContext.get_spark_session()
    # df = spark.read.parquet(file_path)
    df = spark.read.load(file_path, "parquet", schema=schema, **options)

    if columns:
        df = df.select(*columns)

    def to_pandas(columns):
        def f(iter):
            import pandas as pd
            data = list(iter)
            pd_df = pd.DataFrame(data, columns=columns)
            return [pd_df]

        return f

    pd_rdd = df.rdd.mapPartitions(to_pandas(df.columns))
    try:
        data_shards = SparkXShards(pd_rdd)
    except Exception as e:
        print("An error occurred when reading parquet files")
        raise e
    return data_shards

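# Hedged usage sketch (added for illustration, not from the original source): the parquet
# path below is a placeholder; any local, HDFS, or S3 path should work. Each element of the
# returned SparkXShards is a pandas DataFrame holding one partition of the data, so pandas
# operations can be applied per shard via transform_shard.
def _example_read_parquet():
    shards = read_parquet("/tmp/example_dataset.parquet", columns=["user", "item"])
    first_pdf = shards.rdd.first()  # a pandas DataFrame for the first partition
    print(first_pdf.dtypes)
    return shards
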
def predict(self, data, feature_cols=None, batch_size=4):
    """
    Predict input data.

    :param batch_size: Int. The batch size, default is 4.
    :param data: data to be predicted. XShards, Spark DataFrame, numpy array and list of
           numpy arrays are supported. If data is XShards, each partition is a dictionary
           of {'x': feature}, where feature is a numpy array or a list of numpy arrays.
    :param feature_cols: Feature column name(s) of data. Only used when data is a Spark
           DataFrame. Default: None.
    :return: predicted result.
             If the input data is XShards, the predict result is an XShards, and each
             partition of the XShards is a dictionary of {'prediction': result}, where
             the result is a numpy array or a list of numpy arrays.
             If the input data is numpy arrays or a list of numpy arrays, the predict
             result is a numpy array or a list of numpy arrays.
    """
    sc = init_nncontext()
    model_bytes_broadcast = sc.broadcast(self.model_bytes)
    weight_bytes_broadcast = sc.broadcast(self.weight_bytes)

    def partition_inference(partition):
        model_bytes = model_bytes_broadcast.value
        weight_bytes = weight_bytes_broadcast.value
        partition = list(partition)
        data_num = len(partition)
        ie = IECore()
        config = {'CPU_THREADS_NUM': str(self.core_num)}
        ie.set_config(config, 'CPU')
        net = ie.read_network(model=model_bytes, weights=weight_bytes,
                              init_from_buffer=True)
        net.batch_size = batch_size
        local_model = ie.load_network(network=net, device_name="CPU",
                                      num_requests=data_num)
        inputs = list(iter(local_model.requests[0].input_blobs))
        outputs = list(iter(local_model.requests[0].output_blobs))
        assert len(outputs) != 0, "The number of model outputs should not be 0."

        def add_elem(d):
            d_len = len(d)
            if d_len < batch_size:
                rep_time = [1] * (d_len - 1)
                rep_time.append(batch_size - d_len + 1)
                return np.repeat(d, rep_time, axis=0), d_len
            else:
                return d, d_len

        results = []
        for idx, batch_data in enumerate(partition):
            infer_request = local_model.requests[idx]
            input_dict = dict()
            elem_num = 0
            if isinstance(batch_data, list):
                for i, input in enumerate(inputs):
                    input_dict[input], elem_num = add_elem(batch_data[i])
            else:
                input_dict[inputs[0]], elem_num = add_elem(batch_data)
            infer_request.infer(input_dict)
            if len(outputs) == 1:
                results.append(infer_request.output_blobs[outputs[0]].buffer[:elem_num])
            else:
                results.append(list(map(
                    lambda output: infer_request.output_blobs[output].buffer[:elem_num],
                    outputs)))

        return results

    def predict_transform(dict_data, batch_size):
        assert isinstance(dict_data, dict), "each shard should be an dict"
        assert "x" in dict_data, "key x should in each shard"
        feature_data = dict_data["x"]
        if isinstance(feature_data, np.ndarray):
            assert feature_data.shape[0] <= batch_size, \
                "The batch size of input data (the second dim) should be less than the model " \
                "batch size, otherwise some inputs will be ignored."
        elif isinstance(feature_data, list):
            for elem in feature_data:
                assert isinstance(elem, np.ndarray), "Each element in the x list should be " \
                                                     "a ndarray, but get " + \
                                                     elem.__class__.__name__
                assert elem.shape[0] <= batch_size, "The batch size of each input data (the " \
                                                    "second dim) should be less than the " \
                                                    "model batch size, otherwise some inputs " \
                                                    "will be ignored."
        else:
            raise ValueError("x in each shard should be a ndarray or a list of ndarray.")
        return feature_data

    if isinstance(data, DataFrame):
        from bigdl.orca.learn.utils import dataframe_to_xshards, \
            convert_predict_rdd_to_dataframe
        xshards, _ = dataframe_to_xshards(data,
                                          validation_data=None,
                                          feature_cols=feature_cols,
                                          label_cols=None,
                                          mode="predict")
        transformed_data = xshards.transform_shard(predict_transform, batch_size)
        result_rdd = transformed_data.rdd.mapPartitions(
            lambda iter: partition_inference(iter))
        return convert_predict_rdd_to_dataframe(data,
                                                result_rdd.flatMap(lambda data: data))
    elif isinstance(data, SparkXShards):
        transformed_data = data.transform_shard(predict_transform, batch_size)
        result_rdd = transformed_data.rdd.mapPartitions(
            lambda iter: partition_inference(iter))

        def update_result_shard(data):
            shard, y = data
            shard["prediction"] = y
            return shard

        return SparkXShards(data.rdd.zip(result_rdd).map(update_result_shard))
    elif isinstance(data, (np.ndarray, list)):
        if isinstance(data, np.ndarray):
            split_num = math.ceil(len(data) / batch_size)
            arrays = np.array_split(data, split_num)
            num_slices = min(split_num, self.node_num)
            data_rdd = sc.parallelize(arrays, numSlices=num_slices)
        elif isinstance(data, list):
            flattened = nest.flatten(data)
            data_length = len(flattened[0])
            data_to_be_rdd = []
            split_num = math.ceil(flattened[0].shape[0] / batch_size)
            num_slices = min(split_num, self.node_num)
            for i in range(split_num):
                data_to_be_rdd.append([])
            for x in flattened:
                assert isinstance(x, np.ndarray), "the data in the data list should be " \
                                                  "ndarrays, but get " + \
                                                  x.__class__.__name__
                assert len(x) == data_length, \
                    "the ndarrays in data must all have the same size in first dimension" \
                    ", got first ndarray of size {} and another {}".format(data_length,
                                                                           len(x))
                x_parts = np.array_split(x, split_num)
                for idx, x_part in enumerate(x_parts):
                    data_to_be_rdd[idx].append(x_part)

            data_to_be_rdd = [nest.pack_sequence_as(data, shard)
                              for shard in data_to_be_rdd]
            data_rdd = sc.parallelize(data_to_be_rdd, numSlices=num_slices)

        print("Partition number: ", data_rdd.getNumPartitions())
        result_rdd = data_rdd.mapPartitions(lambda iter: partition_inference(iter))
        result_arr_list = result_rdd.collect()
        result_arr = None
        if isinstance(result_arr_list[0], list):
            result_arr = [np.concatenate([r[i] for r in result_arr_list], axis=0)
                          for i in range(len(result_arr_list[0]))]
        elif isinstance(result_arr_list[0], np.ndarray):
            result_arr = np.concatenate(result_arr_list, axis=0)
        return result_arr
    else:
        raise ValueError("Only XShards, Spark DataFrame, a numpy array and a list of numpy "
                         "arrays are supported as input data, but get " +
                         data.__class__.__name__)

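# Hedged usage sketch for the numpy-array branch of predict above (the XShards branch is
# already exercised by test_openvino_predict_xshards). `est` and the [N, 3, 224, 224] input
# shape are illustrative assumptions: any OpenVINO estimator exposing this predict method
# with a matching model input shape would do.
def _example_openvino_predict_numpy(est):
    images = np.random.random([8, 3, 224, 224])
    # the array is split into batches of batch_size, distributed across partitions for
    # inference, and collected back into a single numpy array of predictions
    predictions = est.predict(images, batch_size=4)
    return predictions
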
def test_nnEstimator(self):
    from bigdl.dllib.nnframes import NNModel
    linear_model = Sequential().add(Linear(2, 2))
    mse_criterion = MSECriterion()
    df, _ = self.get_estimator_df()
    est = Estimator.from_bigdl(model=linear_model, loss=mse_criterion, optimizer=Adam(),
                               feature_preprocessing=SeqToTensor([2]),
                               label_preprocessing=SeqToTensor([2]))
    res0 = est.predict(df)
    res0_c = res0.collect()
    est.fit(df, 2, batch_size=4)
    nn_model = NNModel(est.get_model(), feature_preprocessing=SeqToTensor([2]))
    res1 = nn_model.transform(df)
    res2 = est.predict(df)
    res1_c = res1.collect()
    res2_c = res2.collect()
    assert type(res1).__name__ == 'DataFrame'
    assert type(res2).__name__ == 'DataFrame'
    assert len(res1_c) == len(res2_c)
    for idx in range(len(res1_c)):
        assert res1_c[idx]["prediction"] == res2_c[idx]["prediction"]

    with tempfile.TemporaryDirectory() as tempdirname:
        temp_path = os.path.join(tempdirname, "model")
        est.save(temp_path)
        est2 = Estimator.from_bigdl(model=linear_model, loss=mse_criterion)
        est2.load(temp_path, optimizer=Adam(), loss=mse_criterion,
                  feature_preprocessing=SeqToTensor([2]),
                  label_preprocessing=SeqToTensor([2]))
    est2.set_constant_gradient_clipping(0.1, 1.2)
    est2.clear_gradient_clipping()
    res3 = est2.predict(df)
    res3_c = res3.collect()
    assert type(res3).__name__ == 'DataFrame'
    assert len(res1_c) == len(res3_c)
    for idx in range(len(res1_c)):
        assert res1_c[idx]["prediction"] == res3_c[idx]["prediction"]
    est2.fit(df, 4, batch_size=4)

    data = self.sc.parallelize([((2.0, 1.0), (1.0, 2.0)),
                                ((1.0, 2.0), (2.0, 1.0)),
                                ((2.0, 1.0), (1.0, 2.0)),
                                ((1.0, 2.0), (2.0, 1.0))])
    data_shard = SparkXShards(data)
    data_shard = data_shard.transform_shard(
        lambda feature_label_tuple: {
            "x": np.stack([
                np.expand_dims(np.array(feature_label_tuple[0][0]), axis=0),
                np.expand_dims(np.array(feature_label_tuple[0][1]), axis=0)],
                axis=1),
            "y": np.stack([
                np.expand_dims(np.array(feature_label_tuple[1][0]), axis=0),
                np.expand_dims(np.array(feature_label_tuple[1][1]), axis=0)],
                axis=1)
        })
    res4 = est.predict(data_shard)
    res4_c = res4.collect()
    assert type(res4).__name__ == 'SparkXShards'
    for idx in range(len(res4_c)):
        assert abs(res4_c[idx]["prediction"][0][0] - res3_c[idx]["prediction"][0]) == 0
        assert abs(res4_c[idx]["prediction"][0][1] - res3_c[idx]["prediction"][1]) == 0
    est.fit(data_shard, 1, batch_size=4)
    res5 = est.predict(data_shard)
    res5_c = res5.collect()
    res6 = est.predict(df)
    res6_c = res6.collect()
    for idx in range(len(res5_c)):
        assert abs(res5_c[idx]["prediction"][0][0] - res6_c[idx]["prediction"][0]) == 0
        assert abs(res5_c[idx]["prediction"][0][1] - res6_c[idx]["prediction"][1]) == 0

def read_file_spark(file_path, file_type, **kwargs):
    sc = init_nncontext()
    node_num, core_num = get_node_and_core_number()
    backend = OrcaContext.pandas_read_backend

    if backend == "pandas":
        file_url_splits = file_path.split("://")
        prefix = file_url_splits[0]

        file_paths = []
        if isinstance(file_path, list):
            [file_paths.extend(extract_one_path(path, os.environ)) for path in file_path]
        else:
            file_paths = extract_one_path(file_path, os.environ)

        if not file_paths:
            raise Exception("The file path is invalid or empty, please check your data")

        num_files = len(file_paths)
        total_cores = node_num * core_num
        num_partitions = num_files if num_files < total_cores else total_cores
        rdd = sc.parallelize(file_paths, num_partitions)

        if prefix == "hdfs":
            pd_rdd = rdd.mapPartitions(
                lambda iter: read_pd_hdfs_file_list(iter, file_type, **kwargs))
        elif prefix == "s3":
            pd_rdd = rdd.mapPartitions(
                lambda iter: read_pd_s3_file_list(iter, file_type, **kwargs))
        else:
            def loadFile(iterator):
                dfs = []
                for x in iterator:
                    df = read_pd_file(x, file_type, **kwargs)
                    dfs.append(df)
                import pandas as pd
                return [pd.concat(dfs)]

            pd_rdd = rdd.mapPartitions(loadFile)
    else:  # Spark backend; spark.read.csv/json accepts a folder path as input
        assert file_type == "json" or file_type == "csv", \
            "Unsupported file type: %s. Only csv and json files are supported for now" \
            % file_type
        spark = OrcaContext.get_spark_session()
        # TODO: add S3 credentials
        # The following implementation is adapted from
        # https://github.com/databricks/koalas/blob/master/databricks/koalas/namespace.py
        # with some modifications.

        if "mangle_dupe_cols" in kwargs:
            assert kwargs["mangle_dupe_cols"], "mangle_dupe_cols can only be True"
            kwargs.pop("mangle_dupe_cols")
        if "parse_dates" in kwargs:
            assert not kwargs["parse_dates"], "parse_dates can only be False"
            kwargs.pop("parse_dates")

        names = kwargs.get("names", None)
        if "names" in kwargs:
            kwargs.pop("names")
        usecols = kwargs.get("usecols", None)
        if "usecols" in kwargs:
            kwargs.pop("usecols")
        dtype = kwargs.get("dtype", None)
        if "dtype" in kwargs:
            kwargs.pop("dtype")
        squeeze = kwargs.get("squeeze", False)
        if "squeeze" in kwargs:
            kwargs.pop("squeeze")
        index_col = kwargs.get("index_col", None)
        if "index_col" in kwargs:
            kwargs.pop("index_col")

        if file_type == "csv":
            # Handle pandas-compatible keyword arguments
            kwargs["inferSchema"] = True
            header = kwargs.get("header", "infer")
            if isinstance(names, str):
                kwargs["schema"] = names
            if header == "infer":
                header = 0 if names is None else None
            if header == 0:
                kwargs["header"] = True
            elif header is None:
                kwargs["header"] = False
            else:
                raise ValueError("Unknown header argument {}".format(header))
            if "quotechar" in kwargs:
                quotechar = kwargs["quotechar"]
                kwargs.pop("quotechar")
                kwargs["quote"] = quotechar
            if "escapechar" in kwargs:
                escapechar = kwargs["escapechar"]
                kwargs.pop("escapechar")
                kwargs["escape"] = escapechar
            # sep and comment are the same as pandas
            if "comment" in kwargs:
                comment = kwargs["comment"]
                if not isinstance(comment, str) or len(comment) != 1:
                    raise ValueError("Only length-1 comment characters supported")
            df = spark.read.csv(file_path, **kwargs)
            if header is None:
                df = df.selectExpr(*["`%s` as `%s`" % (field.name, i)
                                     for i, field in enumerate(df.schema)])
        else:
            df = spark.read.json(file_path, **kwargs)

        # Handle pandas-compatible postprocessing arguments
        if usecols is not None and not callable(usecols):
            usecols = list(usecols)
        renamed = False
        if isinstance(names, list):
            if len(set(names)) != len(names):
                raise ValueError("Found duplicate names, please check your names input")
            if usecols is not None:
                if not callable(usecols):
                    # usecols is list
                    if len(names) != len(usecols) and len(names) != len(df.schema):
                        raise ValueError("Passed names did not match usecols")
                if len(names) == len(df.schema):
                    df = df.selectExpr(*["`%s` as `%s`" % (field.name, name)
                                         for field, name in zip(df.schema, names)])
                    renamed = True
            else:
                if len(names) != len(df.schema):
                    raise ValueError("The number of names [%s] does not match the number "
                                     "of columns [%d]. Try names by a Spark SQL DDL-formatted "
                                     "string." % (len(names), len(df.schema)))
                df = df.selectExpr(*["`%s` as `%s`" % (field.name, name)
                                     for field, name in zip(df.schema, names)])
                renamed = True

        index_map = dict([(i, field.name) for i, field in enumerate(df.schema)])
        if usecols is not None:
            if callable(usecols):
                cols = [field.name for field in df.schema if usecols(field.name)]
                missing = []
            elif all(isinstance(col, int) for col in usecols):
                cols = [field.name for i, field in enumerate(df.schema) if i in usecols]
                missing = [col for col in usecols
                           if col >= len(df.schema) or df.schema[col].name not in cols]
            elif all(isinstance(col, str) for col in usecols):
                cols = [field.name for field in df.schema if field.name in usecols]
                if isinstance(names, list):
                    missing = [c for c in usecols if c not in names]
                else:
                    missing = [col for col in usecols if col not in cols]
            else:
                raise ValueError("usecols must only be list-like of all strings, "
                                 "all unicode, all integers or a callable.")
            if len(missing) > 0:
                raise ValueError("usecols do not match columns, "
                                 "columns expected but not found: %s" % missing)
            if len(cols) > 0:
                df = df.select(cols)
                if isinstance(names, list):
                    if not renamed:
                        df = df.selectExpr(*["`%s` as `%s`" % (col, name)
                                             for col, name in zip(cols, names)])
                        # update index map after rename
                        for index, col in index_map.items():
                            if col in cols:
                                index_map[index] = names[cols.index(col)]

        if df.rdd.getNumPartitions() < node_num:
            df = df.repartition(node_num)

        def to_pandas(columns, squeeze=False, index_col=None):
            def f(iter):
                import pandas as pd
                data = list(iter)
                pd_df = pd.DataFrame(data, columns=columns)
                if dtype is not None:
                    if isinstance(dtype, dict):
                        for col, type in dtype.items():
                            if isinstance(col, str):
                                if col not in pd_df.columns:
                                    raise ValueError("column to be set type is not"
                                                     " in current dataframe")
                                pd_df[col] = pd_df[col].astype(type)
                            elif isinstance(col, int):
                                if index_map[col] not in pd_df.columns:
                                    raise ValueError("column index to be set type is not"
                                                     " in current dataframe")
                                pd_df[index_map[col]] = pd_df[index_map[col]].astype(type)
                    else:
                        pd_df = pd_df.astype(dtype)
                if squeeze and len(pd_df.columns) == 1:
                    pd_df = pd_df.iloc[:, 0]
                if index_col:
                    pd_df = pd_df.set_index(index_col)
                return [pd_df]

            return f

        pd_rdd = df.rdd.mapPartitions(to_pandas(df.columns, squeeze, index_col))

    try:
        data_shards = SparkXShards(pd_rdd)
    except Exception as e:
        alternative_backend = "pandas" if backend == "spark" else "spark"
        print("An error occurred when reading files with '%s' backend, you may switch to "
              "'%s' backend for another try. You can set the backend using "
              "OrcaContext.pandas_read_backend" % (backend, alternative_backend))
        raise e
    return data_shards
