def test_bigdl_pytorch_estimator_shard(self):
    class SimpleModel(nn.Module):
        def __init__(self):
            super(SimpleModel, self).__init__()
            self.fc = nn.Linear(2, 2)

        def forward(self, x):
            x = self.fc(x)
            return F.log_softmax(x, dim=1)

    model = SimpleModel()

    def loss_func(input, target):
        return nn.CrossEntropyLoss().forward(input, target.flatten().long())

    def transform(df):
        result = {
            "x": [df['user'].to_numpy(), df['item'].to_numpy()],
            "y": df['label'].to_numpy()
        }
        return result

    OrcaContext.pandas_read_backend = "pandas"
    file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
    data_shard = read_csv(file_path)
    data_shard = data_shard.transform_shard(transform)

    estimator = Estimator.from_torch(model=model, loss=loss_func,
                                     optimizer=SGD(), backend="bigdl")
    estimator.fit(data=data_shard, epochs=4, batch_size=2,
                  validation_data=data_shard,
                  validation_methods=[Accuracy()],
                  checkpoint_trigger=EveryEpoch())
    estimator.evaluate(data_shard, validation_methods=[Accuracy()], batch_size=2)
def test_pandas_dataframe(self):
    OrcaContext.pandas_read_backend = "pandas"
    file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
    data_shard = read_csv(file_path, usecols=[0, 1, 2],
                          dtype={0: np.float32, 1: np.float32, 2: np.float32})

    estimator = get_estimator(model_fn=lambda config: SimpleModel())
    estimator.fit(data_shard, batch_size=2, epochs=2,
                  feature_cols=["user", "item"],
                  label_cols=["label"])

    estimator.evaluate(data_shard, batch_size=2,
                       feature_cols=["user", "item"],
                       label_cols=["label"])

    result = estimator.predict(data_shard, batch_size=2,
                               feature_cols=["user", "item"])
    result.collect()
def test_bigdl_pytorch_estimator_shard(self):
    class SimpleModel(nn.Module):
        def __init__(self):
            super(SimpleModel, self).__init__()
            self.fc = nn.Linear(2, 2)

        def forward(self, x):
            x = self.fc(x)
            return F.log_softmax(x, dim=1)

    model = SimpleModel()

    def loss_func(input, target):
        return nn.CrossEntropyLoss().forward(input, target.flatten().long())

    def transform(df):
        result = {
            "x": np.stack([df['user'].to_numpy(), df['item'].to_numpy()], axis=1),
            "y": df['label'].to_numpy()
        }
        return result

    def transform_del_y(d):
        result = {"x": d["x"]}
        return result

    OrcaContext.pandas_read_backend = "pandas"
    file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
    data_shard = read_csv(file_path)
    data_shard = data_shard.transform_shard(transform)

    with tempfile.TemporaryDirectory() as temp_dir_name:
        estimator = Estimator.from_torch(model=model, loss=loss_func,
                                         metrics=[Accuracy()],
                                         optimizer=SGD(learningrate_schedule=Default()),
                                         model_dir=temp_dir_name)
        estimator.fit(data=data_shard, epochs=4, batch_size=2,
                      validation_data=data_shard,
                      checkpoint_trigger=EveryEpoch())
        estimator.evaluate(data_shard, batch_size=2)

        est2 = Estimator.from_torch(model=model, loss=loss_func,
                                    metrics=[Accuracy()],
                                    optimizer=None)
        est2.load(temp_dir_name, loss=loss_func)
        est2.fit(data=data_shard, epochs=8, batch_size=2,
                 validation_data=data_shard,
                 checkpoint_trigger=EveryEpoch())
        est2.evaluate(data_shard, batch_size=2)

        pred_result = est2.predict(data_shard)
        pred_c = pred_result.collect()
        # the original `assert(pred_result, SparkXShards)` always passed;
        # an isinstance check is what was intended
        assert isinstance(pred_result, SparkXShards)

        pred_shard = data_shard.transform_shard(transform_del_y)
        pred_result2 = est2.predict(pred_shard)
        pred_c_2 = pred_result2.collect()
        assert (pred_c[0]["prediction"] == pred_c_2[0]["prediction"]).all()
def test_bigdl_pytorch_estimator_pandas_dataframe(self):
    class SimpleModel(nn.Module):
        def __init__(self):
            super(SimpleModel, self).__init__()
            self.fc = nn.Linear(1, 10)

        def forward(self, x):
            x = torch.unsqueeze(x, dim=1)
            x = self.fc(x)
            return F.log_softmax(x, dim=1)

    def loss_func(input, target):
        return nn.CrossEntropyLoss().forward(input, target.flatten().long())

    model = SimpleModel()

    OrcaContext.pandas_read_backend = "pandas"
    file_path = os.path.join(resource_path, "orca/learn/simple_feature_label.csv")
    data_shard = read_csv(file_path)

    with tempfile.TemporaryDirectory() as temp_dir_name:
        estimator = Estimator.from_torch(
            model=model, loss=loss_func,
            metrics=[Accuracy()],
            optimizer=SGD(learningrate_schedule=Default()),
            model_dir=temp_dir_name)
        estimator.fit(data=data_shard, epochs=1, batch_size=4,
                      feature_cols=['feature'],
                      label_cols=['label'],
                      validation_data=data_shard,
                      checkpoint_trigger=EveryEpoch())
        estimator.evaluate(data_shard, batch_size=4,
                           feature_cols=['feature'],
                           label_cols=['label'])

        est2 = Estimator.from_torch(model=model, loss=loss_func,
                                    metrics=[Accuracy()],
                                    optimizer=None)
        est2.load_orca_checkpoint(temp_dir_name)
        est2.predict(data_shard, batch_size=4, feature_cols=['feature'])
def test_xshards_spark_estimator_multi_inputs(self):
    resource_path = os.path.join(
        os.path.split(__file__)[0], "../../../resources")

    def transform(df):
        result = {
            "x": [
                np.expand_dims(df['user'].to_numpy(), axis=1),
                np.expand_dims(df['item'].to_numpy(), axis=1)
            ],
            "y": df['label'].to_numpy()
        }
        return result

    file_path = os.path.join(resource_path, "orca/learn/ncf2.csv")
    data_shard = read_csv(file_path)
    data_shard = data_shard.transform_shard(transform)

    zx1 = ZLayer.Input(shape=(1, ))
    zx2 = ZLayer.Input(shape=(1, ))
    zz = ZLayer.merge([zx1, zx2], mode="concat")
    zy = ZLayer.Dense(2)(zz)
    model = ZModel([zx1, zx2], zy)

    optim_method = SGD(learningrate=0.01)
    with tempfile.TemporaryDirectory() as temp_dir_name:
        estimator = Estimator.from_bigdl(model=model,
                                         optimizer=optim_method,
                                         loss=ClassNLLCriterion(),
                                         metrics=[Accuracy()],
                                         model_dir=temp_dir_name)
        estimator.set_constant_gradient_clipping(0.1, 1.2)
        r1 = estimator.predict(data=data_shard)
        r_c = r1.collect()
        estimator.set_tensorboard(log_dir=temp_dir_name, app_name="test")
        estimator.fit(data=data_shard, epochs=5, batch_size=8,
                      validation_data=data_shard,
                      checkpoint_trigger=EveryEpoch())
        summary = estimator.get_train_summary(tag="Loss")
        temp_path = os.path.join(temp_dir_name, "save_model")
        estimator.save(temp_path)
        eval_result = estimator.evaluate(data=data_shard, batch_size=8)
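# A minimal numpy-only sketch (not part of the library API; the helper name and
# values are illustrative) contrasting the two "x" layouts the shards in these
# tests use: a single stacked tensor of shape (n, 2) for a one-input model such
# as SimpleModel above, versus a list of two (n, 1) arrays for a two-input model
# such as the ZModel([zx1, zx2], zy) built in the previous test.
def _example_input_layouts():
    users = np.array([1.0, 2.0, 3.0], dtype=np.float32)
    items = np.array([4.0, 5.0, 6.0], dtype=np.float32)
    single_input_x = np.stack([users, items], axis=1)    # shape (3, 2), one input
    multi_input_x = [np.expand_dims(users, axis=1),      # shape (3, 1), input 1
                     np.expand_dims(items, axis=1)]      # shape (3, 1), input 2
    return single_input_x, multi_input_x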
def test_xshardstsdataset_initialization_multiple(self):
    shards_multiple = read_csv(
        os.path.join(self.resource_path, "chronos/multiple.csv"))
    # legal input
    tsdata = XShardsTSDataset.from_xshards(
        shards_multiple, dt_col="datetime", target_col="value",
        extra_feature_col=["extra feature"], id_col="id")
    assert tsdata._id_list == [0, 1]
    assert tsdata.feature_col == ["extra feature"]
    assert tsdata.target_col == ["value"]
    assert tsdata.dt_col == "datetime"
    assert tsdata.shards.num_partitions() == 2

    tsdata = XShardsTSDataset.from_xshards(
        shards_multiple, dt_col="datetime", target_col=["value"],
        extra_feature_col="extra feature", id_col="id")
    assert tsdata._id_list == [0, 1]
    assert tsdata.feature_col == ["extra feature"]
    assert tsdata.target_col == ["value"]
    assert tsdata.dt_col == "datetime"
    assert tsdata.shards.num_partitions() == 2

    tsdata = XShardsTSDataset.from_xshards(
        shards_multiple, dt_col="datetime", target_col=["value"],
        extra_feature_col="extra feature")
    assert tsdata._id_list == ['0']
    assert tsdata.feature_col == ["extra feature"]
    assert tsdata.target_col == ["value"]
    assert tsdata.dt_col == "datetime"
    assert tsdata.shards.num_partitions() == 1
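# A hypothetical sketch of the schema chronos/multiple.csv is assumed to follow,
# inferred only from the column names and id values asserted in these tests
# (ids 0 and 1, a "datetime" column, one "value" target column and one
# "extra feature" column); the real fixture's length and values may differ.
def _example_multiple_schema():
    import pandas as pd
    return pd.DataFrame({
        "id": [0, 0, 1, 1],
        "datetime": pd.to_datetime(["2020-01-01", "2020-01-02",
                                    "2020-01-01", "2020-01-02"]),
        "value": [0.1, 0.2, 0.3, 0.4],
        "extra feature": [1.0, 2.0, 3.0, 4.0],
    })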
def test_xshardstsdataset_split(self):
    shards_multiple = read_csv(
        os.path.join(self.resource_path, "chronos/multiple.csv"))
    # only train and test
    tsdata_train, tsdata_valid, tsdata_test =\
        XShardsTSDataset.from_xshards(shards_multiple,
                                      dt_col="datetime", target_col="value",
                                      extra_feature_col=["extra feature"],
                                      id_col="id", with_split=True,
                                      val_ratio=0, test_ratio=0.1)
    # standard split with all three sets
    tsdata_train, tsdata_valid, tsdata_test =\
        XShardsTSDataset.from_xshards(shards_multiple,
                                      dt_col="datetime", target_col="value",
                                      extra_feature_col=["extra feature"],
                                      id_col="id", with_split=True,
                                      val_ratio=0.1, test_ratio=0.1,
                                      largest_look_back=5, largest_horizon=2)

    tsdata_train.feature_col.append("new extra feature")
    assert len(tsdata_train.feature_col) == 2
    assert len(tsdata_valid.feature_col) == 1
    assert len(tsdata_test.feature_col) == 1

    tsdata_train.target_col[0] = "new value"
    assert tsdata_train.target_col[0] == "new value"
    assert tsdata_valid.target_col[0] != "new value"
    assert tsdata_test.target_col[0] != "new value"
def test_xshards_spark_estimator(self):
    resource_path = os.path.join(
        os.path.split(__file__)[0], "../../../resources")

    def transform(df):
        result = {
            "x": [df['user'].to_numpy(), df['item'].to_numpy()],
            "y": df['label'].to_numpy()
        }
        return result

    file_path = os.path.join(resource_path, "orca/learn/ncf2.csv")
    data_shard = read_csv(file_path)
    data_shard = data_shard.transform_shard(transform)

    model = Sequential()
    model.add(Linear(2, 2))
    model.add(LogSoftMax())
    optim_method = SGD(learningrate=0.01)

    with tempfile.TemporaryDirectory() as temp_dir_name:
        estimator = Estimator.from_bigdl(
            model=model,
            optimizer=optim_method,
            loss=ClassNLLCriterion(),
            model_dir=temp_dir_name,
            feature_preprocessing=SeqToTensor([2]),
            label_preprocessing=SeqToTensor([1]))
        estimator.set_constant_gradient_clipping(0.1, 1.2)
        r1 = estimator.predict(data=data_shard)
        r_c = r1.collect()
        estimator.set_tensorboard(log_dir=temp_dir_name, app_name="test")
        estimator.fit(data=data_shard, epochs=5, batch_size=8,
                      validation_data=data_shard,
                      validation_metrics=[Accuracy()],
                      checkpoint_trigger=EveryEpoch())
        summary = estimator.get_train_summary(tag="Loss")
        temp_path = os.path.join(temp_dir_name, "save_model")
        estimator.save(temp_path)
        estimator.evaluate(data=data_shard,
                           validation_metrics=[Accuracy()],
                           batch_size=8)
        result = estimator.predict(data=data_shard)
        assert type(result).__name__ == 'SparkXShards'
        result_c = result.collect()

        df = self.get_estimator_df2()
        r0 = estimator.predict(df)
        r0_c = r0.collect()
        assert type(r0).__name__ == 'DataFrame'
        for idx in range(len(r0_c)):
            assert abs(r0_c[idx]["prediction"][0]
                       - result_c[0]["prediction"][idx][0]) == 0
            assert abs(r0_c[idx]["prediction"][1]
                       - result_c[0]["prediction"][idx][1]) == 0
        estimator.fit(data=df, epochs=6, batch_size=8,
                      validation_data=df,
                      validation_metrics=[Accuracy()],
                      validation_trigger=EveryEpoch())
        summary = estimator.get_train_summary()

        # test load from checkpoint
        est2 = Estimator.from_bigdl(model=Sequential(), optimizer=None,
                                    loss=None, model_dir=None)
        est2.load(temp_dir_name, loss=ClassNLLCriterion(), is_checkpoint=True)
        r2 = est2.predict(data=data_shard)
        r2_c = r2.collect()
        assert (result_c[0]["prediction"] == r2_c[0]["prediction"]).all()
        # resume training
        est2.fit(data=data_shard, epochs=10, batch_size=8,
                 validation_data=data_shard,
                 validation_metrics=[Accuracy()],
                 checkpoint_trigger=EveryEpoch())
        est2.evaluate(data=data_shard,
                      validation_metrics=[Accuracy()],
                      batch_size=8)

        # test load from saved model
        est3 = Estimator.from_bigdl(model=Sequential(), optimizer=None,
                                    loss=None, model_dir=None)
        est3.load(temp_path, optimizer=optim_method, loss=ClassNLLCriterion())
        r3 = est3.predict(data=data_shard)
        r3_c = r3.collect()
        assert (r3_c[0]["prediction"] == r2_c[0]["prediction"]).all()
def test_xshardstsdataset_roll_multiple_id(self):
    shards_multiple = read_csv(
        os.path.join(self.resource_path, "chronos/multiple.csv"))
    horizon = random.randint(1, 10)
    lookback = random.randint(1, 20)

    tsdata = XShardsTSDataset.from_xshards(
        shards_multiple, dt_col="datetime", target_col="value",
        extra_feature_col=["extra feature"], id_col="id")

    # to_xshards before roll should fail
    with pytest.raises(RuntimeError):
        tsdata.to_xshards()

    # roll train
    tsdata.roll(lookback=lookback, horizon=horizon)
    shards_numpy = tsdata.to_xshards()
    # collect and validate
    collected_numpy = shards_numpy.collect()
    x = np.concatenate(
        [collected_numpy[i]['x'] for i in range(len(collected_numpy))], axis=0)
    y = np.concatenate(
        [collected_numpy[i]['y'] for i in range(len(collected_numpy))], axis=0)
    assert x.shape == ((50 - lookback - horizon + 1) * 2, lookback, 2)
    assert y.shape == ((50 - lookback - horizon + 1) * 2, horizon, 1)

    tsdata.roll(lookback=lookback, horizon=horizon,
                feature_col=["extra feature"], target_col="value")
    shards_numpy = tsdata.to_xshards()
    # collect and validate
    collected_numpy = shards_numpy.collect()
    x = np.concatenate(
        [collected_numpy[i]['x'] for i in range(len(collected_numpy))], axis=0)
    y = np.concatenate(
        [collected_numpy[i]['y'] for i in range(len(collected_numpy))], axis=0)
    assert x.shape == ((50 - lookback - horizon + 1) * 2, lookback, 2)
    assert y.shape == ((50 - lookback - horizon + 1) * 2, horizon, 1)

    tsdata.roll(lookback=lookback, horizon=horizon,
                feature_col=[], target_col="value")
    shards_numpy = tsdata.to_xshards()
    # collect and validate
    collected_numpy = shards_numpy.collect()
    x = np.concatenate(
        [collected_numpy[i]['x'] for i in range(len(collected_numpy))], axis=0)
    y = np.concatenate(
        [collected_numpy[i]['y'] for i in range(len(collected_numpy))], axis=0)
    assert x.shape == ((50 - lookback - horizon + 1) * 2, lookback, 1)
    assert y.shape == ((50 - lookback - horizon + 1) * 2, horizon, 1)

    # roll test
    horizon = 0
    lookback = random.randint(1, 20)
    tsdata.roll(lookback=lookback, horizon=horizon)
    shards_numpy = tsdata.to_xshards()
    # collect and validate
    collected_numpy = shards_numpy.collect()
    x = np.concatenate(
        [collected_numpy[i]['x'] for i in range(len(collected_numpy))], axis=0)
    assert x.shape == ((50 - lookback - horizon + 1) * 2, lookback, 2)
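# The shape arithmetic behind the assertions above, as a minimal standalone
# sketch in plain numpy (independent of XShardsTSDataset; the helper name is
# illustrative): rolling a series of length n with a given lookback and horizon
# yields n - lookback - horizon + 1 windows, and multiple.csv is assumed to hold
# 50 rows per id for 2 ids, hence the "* 2" factor in the expected shapes.
def _roll_single_series(values, lookback, horizon):
    num_samples = len(values) - lookback - horizon + 1
    x = np.stack([values[i:i + lookback] for i in range(num_samples)])
    y = np.stack([values[i + lookback:i + lookback + horizon]
                  for i in range(num_samples)])
    return x, y

# For example, a 50-step series with lookback=10 and horizon=3 gives 38 pairs:
# x, y = _roll_single_series(np.arange(50), 10, 3)
# assert x.shape == (38, 10) and y.shape == (38, 3)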