Example #1
    def test_pandas_dataframe(self):

        OrcaContext.pandas_read_backend = "pandas"
        file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
        data_shard = read_csv(file_path,
                              usecols=[0, 1, 2],
                              dtype={
                                  0: np.float32,
                                  1: np.float32,
                                  2: np.float32
                              })

        estimator = get_estimator(model_fn=lambda config: SimpleModel())
        estimator.fit(data_shard,
                      batch_size=2,
                      epochs=2,
                      feature_cols=["user", "item"],
                      label_cols=["label"])

        estimator.evaluate(data_shard,
                           batch_size=2,
                           feature_cols=["user", "item"],
                           label_cols=["label"])
        result = estimator.predict(data_shard,
                                   batch_size=2,
                                   feature_cols=["user", "item"])
        result.collect()
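
This example relies on a `SimpleModel` class and a `get_estimator` helper that are defined elsewhere in the test module. A minimal sketch of what they could look like, assuming the same Orca PyTorch Estimator API used in the later examples (the import paths and the exact helper signature are assumptions, not the actual BigDL fixtures):

import torch.nn as nn
import torch.nn.functional as F

from bigdl.orca.learn.pytorch import Estimator           # assumed import path
from bigdl.orca.learn.metrics import Accuracy            # assumed import path
from bigdl.orca.learn.optimizers import SGD              # assumed import path
from bigdl.orca.learn.optimizers.schedule import Default


class SimpleModel(nn.Module):
    # Tiny two-feature classifier for the "user"/"item" columns above.
    def __init__(self):
        super(SimpleModel, self).__init__()
        self.fc = nn.Linear(2, 2)

    def forward(self, x):
        return F.log_softmax(self.fc(x), dim=1)


def loss_func(input, target):
    # Same loss wrapper as in the other examples: labels arrive as floats.
    return nn.CrossEntropyLoss().forward(input, target.flatten().long())


def get_estimator(model_fn):
    # Hypothetical helper: build an Orca Estimator around model_fn(config).
    return Estimator.from_torch(model=model_fn(config={}),
                                loss=loss_func,
                                metrics=[Accuracy()],
                                optimizer=SGD(learningrate_schedule=Default()))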
Example #2
    def test_bigdl_pytorch_estimator_pandas_dataframe(self):
        class SimpleModel(nn.Module):
            def __init__(self):
                super(SimpleModel, self).__init__()
                self.fc = nn.Linear(1, 10)

            def forward(self, x):
                x = torch.unsqueeze(x, dim=1)
                x = self.fc(x)
                return F.log_softmax(x, dim=1)

        def loss_func(input, target):
            return nn.CrossEntropyLoss().forward(input,
                                                 target.flatten().long())

        model = SimpleModel()

        OrcaContext.pandas_read_backend = "pandas"
        file_path = os.path.join(resource_path,
                                 "orca/learn/simple_feature_label.csv")
        data_shard = read_csv(file_path)

        with tempfile.TemporaryDirectory() as temp_dir_name:
            estimator = Estimator.from_torch(
                model=model,
                loss=loss_func,
                metrics=[Accuracy()],
                optimizer=SGD(learningrate_schedule=Default()),
                model_dir=temp_dir_name)
            estimator.fit(data=data_shard,
                          epochs=1,
                          batch_size=4,
                          feature_cols=['feature'],
                          label_cols=['label'],
                          validation_data=data_shard,
                          checkpoint_trigger=EveryEpoch())
            estimator.evaluate(data_shard,
                               batch_size=4,
                               feature_cols=['feature'],
                               label_cols=['label'])
            est2 = Estimator.from_torch(model=model,
                                        loss=loss_func,
                                        metrics=[Accuracy()],
                                        optimizer=None)
            est2.load_orca_checkpoint(temp_dir_name)
            est2.predict(data_shard, batch_size=4, feature_cols=['feature'])
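
The test reads `simple_feature_label.csv` without selecting columns, so each shard's DataFrame is expected to already carry a scalar `feature` column and an integer-like `label` column (consistent with `nn.Linear(1, 10)` and the cross-entropy wrapper). A hypothetical stand-in file with that assumed layout:

import pandas as pd

# Assumed layout of simple_feature_label.csv: one scalar feature per row and
# a class label in [0, 10), matching nn.Linear(1, 10) and loss_func above.
pd.DataFrame({
    "feature": [0.5, 1.2, 3.3, 4.1],
    "label":   [0,   1,   2,   3],
}).to_csv("simple_feature_label.csv", index=False)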
Example #3
    def test_xshards_spark_estimator_multi_inputs(self):
        resource_path = os.path.join(
            os.path.split(__file__)[0], "../../resources")

        def transform(df):
            result = {
                "x": [
                    np.expand_dims(df['user'].to_numpy(), axis=1),
                    np.expand_dims(df['item'].to_numpy(), axis=1)
                ],
                "y":
                df['label'].to_numpy()
            }
            return result

        file_path = os.path.join(resource_path, "orca/learn/ncf2.csv")
        data_shard = read_csv(file_path)
        data_shard = data_shard.transform_shard(transform)
        zx1 = ZLayer.Input(shape=(1, ))
        zx2 = ZLayer.Input(shape=(1, ))
        zz = ZLayer.merge([zx1, zx2], mode="concat")
        zy = ZLayer.Dense(2)(zz)
        model = ZModel([zx1, zx2], zy)

        optim_method = SGD(learningrate=0.01)
        with tempfile.TemporaryDirectory() as temp_dir_name:
            estimator = Estimator.from_bigdl(model=model,
                                             optimizer=optim_method,
                                             loss=ClassNLLCriterion(),
                                             metrics=[Accuracy()],
                                             model_dir=temp_dir_name)
            estimator.set_constant_gradient_clipping(0.1, 1.2)
            r1 = estimator.predict(data=data_shard)
            r_c = r1.collect()
            estimator.set_tensorboard(log_dir=temp_dir_name, app_name="test")
            estimator.fit(data=data_shard,
                          epochs=5,
                          batch_size=8,
                          validation_data=data_shard,
                          checkpoint_trigger=EveryEpoch())
            summary = estimator.get_train_summary(tag="Loss")
            temp_path = os.path.join(temp_dir_name, "save_model")
            estimator.save(temp_path)
            eval_result = estimator.evaluate(data=data_shard, batch_size=8)
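
For the multi-input BigDL model above, the `transform` function must emit `x` as a list with one array per `ZLayer.Input`. A NumPy-only sketch of the shapes involved (the sample values are made up):

import numpy as np
import pandas as pd

# Stand-in for one pandas shard of ncf2.csv (values are made up).
df = pd.DataFrame({"user": [1, 2, 3], "item": [4, 5, 6], "label": [1, 2, 1]})

x1 = np.expand_dims(df["user"].to_numpy(), axis=1)  # (3, 1) -> first Input(shape=(1,))
x2 = np.expand_dims(df["item"].to_numpy(), axis=1)  # (3, 1) -> second Input(shape=(1,))
y = df["label"].to_numpy()                          # (3,)

print(x1.shape, x2.shape, y.shape)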
Example #4
    def test_xshardstsdataset_initialization_multiple(self):
        shards_multiple = read_csv(
            os.path.join(self.resource_path, "multiple.csv"))
        # valid input
        tsdata = XShardsTSDataset.from_xshards(
            shards_multiple,
            dt_col="datetime",
            target_col="value",
            extra_feature_col=["extra feature"],
            id_col="id")
        assert tsdata._id_list == [0, 1]
        assert tsdata.feature_col == ["extra feature"]
        assert tsdata.target_col == ["value"]
        assert tsdata.dt_col == "datetime"
        assert tsdata.shards.num_partitions() == 2

        tsdata = XShardsTSDataset.from_xshards(
            shards_multiple,
            dt_col="datetime",
            target_col=["value"],
            extra_feature_col="extra feature",
            id_col="id")
        assert tsdata._id_list == [0, 1]
        assert tsdata.feature_col == ["extra feature"]
        assert tsdata.target_col == ["value"]
        assert tsdata.dt_col == "datetime"
        assert tsdata.shards.num_partitions() == 2

        tsdata = XShardsTSDataset.from_xshards(
            shards_multiple,
            dt_col="datetime",
            target_col=["value"],
            extra_feature_col="extra feature")
        assert tsdata._id_list == ['0']
        assert tsdata.feature_col == ["extra feature"]
        assert tsdata.target_col == ["value"]
        assert tsdata.dt_col == "datetime"
        assert tsdata.shards.num_partitions() == 1
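
The assertions above pin down the assumed layout of `multiple.csv`: two ids, with `datetime`, `value` and `extra feature` columns, and a single default id `'0'` when `id_col` is omitted. A hypothetical script that would produce a file of that shape (50 rows per id, consistent with the rolling test further down):

import numpy as np
import pandas as pd

# Hypothetical multiple.csv: 50 timestamped rows for each of two ids.
dates = pd.date_range("2020-01-01", periods=50, freq="D")
frames = [pd.DataFrame({"id": i,
                        "datetime": dates,
                        "value": np.random.rand(50),
                        "extra feature": np.random.rand(50)})
          for i in (0, 1)]
pd.concat(frames, ignore_index=True).to_csv("multiple.csv", index=False)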
Example #5
    def test_xshardstsdataset_split(self):
        shards_multiple = read_csv(
            os.path.join(self.resource_path, "multiple.csv"))
        # only train and test
        tsdata_train, tsdata_valid, tsdata_test =\
            XShardsTSDataset.from_xshards(shards_multiple, dt_col="datetime", target_col="value",
                                          extra_feature_col=["extra feature"], id_col="id",
                                          with_split=True, val_ratio=0, test_ratio=0.1)
        # standard split with all three sets
        tsdata_train, tsdata_valid, tsdata_test =\
            XShardsTSDataset.from_xshards(shards_multiple, dt_col="datetime", target_col="value",
                                          extra_feature_col=["extra feature"], id_col="id",
                                          with_split=True, val_ratio=0.1, test_ratio=0.1,
                                          largest_look_back=5, largest_horizon=2)

        tsdata_train.feature_col.append("new extra feature")
        assert len(tsdata_train.feature_col) == 2
        assert len(tsdata_valid.feature_col) == 1
        assert len(tsdata_test.feature_col) == 1

        tsdata_train.target_col[0] = "new value"
        assert tsdata_train.target_col[0] == "new value"
        assert tsdata_valid.target_col[0] != "new value"
        assert tsdata_test.target_col[0] != "new value"
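
The mutation checks at the end only hold if every split keeps its own copy of the column metadata. A plain-Python sketch of that copy-on-split behaviour (not the Chronos implementation itself):

# Each split receives an independent copy of the column lists, so mutating
# the train split's metadata cannot leak into the validation or test splits.
feature_col = ["extra feature"]
train_cols = list(feature_col)   # copy handed to the train split
valid_cols = list(feature_col)   # copy handed to the validation split
test_cols = list(feature_col)    # copy handed to the test split

train_cols.append("new extra feature")
assert len(train_cols) == 2
assert len(valid_cols) == 1
assert len(test_cols) == 1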
Example #6
    def test_xshards_spark_estimator(self):
        resource_path = os.path.join(
            os.path.split(__file__)[0], "../../resources")

        def transform(df):
            result = {
                "x":
                np.stack([df['user'].to_numpy(), df['item'].to_numpy()],
                         axis=1),
                "y":
                df['label'].to_numpy()
            }
            return result

        file_path = os.path.join(resource_path, "orca/learn/ncf2.csv")
        data_shard = read_csv(file_path)
        data_shard = data_shard.transform_shard(transform)
        model = Sequential()
        model.add(Linear(2, 2))
        model.add(LogSoftMax())
        optim_method = SGD(learningrate=0.01)
        with tempfile.TemporaryDirectory() as temp_dir_name:
            estimator = Estimator.from_bigdl(
                model=model,
                optimizer=optim_method,
                loss=ClassNLLCriterion(),
                metrics=Accuracy(),
                model_dir=temp_dir_name,
                feature_preprocessing=SeqToTensor([2]),
                label_preprocessing=SeqToTensor([1]))
            estimator.set_constant_gradient_clipping(0.1, 1.2)
            r1 = estimator.predict(data=data_shard)
            r_c = r1.collect()
            estimator.set_tensorboard(log_dir=temp_dir_name, app_name="test")
            estimator.fit(data=data_shard,
                          epochs=5,
                          batch_size=8,
                          validation_data=data_shard,
                          checkpoint_trigger=EveryEpoch())
            summary = estimator.get_train_summary(tag="Loss")
            temp_path = os.path.join(temp_dir_name, "save_model")
            estimator.save(temp_path)
            with self.assertRaises(Exception) as context:
                Estimator.from_bigdl(model=model,
                                     optimizer=optim_method,
                                     loss=ClassNLLCriterion(),
                                     metrics=['accuracy'],
                                     model_dir=temp_dir_name,
                                     feature_preprocessing=SeqToTensor([2]),
                                     label_preprocessing=SeqToTensor([1]))
            self.assertTrue('Only orca metrics are supported, but get str' in
                            str(context.exception))
            eval_result = estimator.evaluate(data=data_shard, batch_size=8)
            assert isinstance(eval_result, dict)
            result = estimator.predict(data=data_shard)
            assert type(result).__name__ == 'SparkXShards'
            result_c = result.collect()
            df = self.get_estimator_df2()
            r0 = estimator.predict(df)
            r0_c = r0.collect()
            assert type(r0).__name__ == 'DataFrame'
            for idx in range(len(r0_c)):
                assert abs(r0_c[idx]["prediction"][0] -
                           result_c[0]["prediction"][idx][0]) <= 1e-06
                assert abs(r0_c[idx]["prediction"][1] -
                           result_c[0]["prediction"][idx][1]) <= 1e-06
            estimator.fit(data=df,
                          epochs=6,
                          batch_size=8,
                          validation_data=df,
                          validation_trigger=EveryEpoch())
            summary = estimator.get_train_summary("Loss")

            # test load from checkpoint
            est2 = Estimator.from_bigdl(model=Sequential(),
                                        optimizer=None,
                                        loss=None,
                                        metrics=[Accuracy()],
                                        model_dir=None)
            est2.load(temp_dir_name,
                      loss=ClassNLLCriterion(),
                      is_checkpoint=True)
            r2 = est2.predict(data=data_shard)
            r2_c = r2.collect()
            assert (result_c[0]["prediction"] == r2_c[0]["prediction"]).all()
            # resume training
            est2.fit(data=data_shard,
                     epochs=10,
                     batch_size=8,
                     validation_data=data_shard,
                     checkpoint_trigger=EveryEpoch())
            est2.evaluate(data=data_shard, batch_size=8)
            # test load from saved model
            est3 = Estimator.from_bigdl(model=Sequential(),
                                        optimizer=None,
                                        loss=None,
                                        model_dir=None)
            est3.load(temp_path,
                      optimizer=optim_method,
                      loss=ClassNLLCriterion())
            r3 = est3.predict(data=data_shard)
            r3_c = r3.collect()
            assert (r3_c[0]["prediction"] == r2_c[0]["prediction"]).all()
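
The element-wise comparison between the Spark DataFrame predictions (`r0_c`, one row per record) and the XShards predictions (`result_c[0]["prediction"]`, one batched array per partition) can be written more compactly with NumPy; a sketch with made-up values:

import numpy as np

shard_pred = np.array([[0.1, 0.9], [0.7, 0.3]])           # result_c[0]["prediction"]
df_pred = [np.array([0.1, 0.9]), np.array([0.7, 0.3])]    # r0_c[idx]["prediction"]

# Same check as the per-element loop above, within a 1e-6 tolerance.
assert np.allclose(np.stack(df_pred), shard_pred, atol=1e-6)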
Example #7
    def test_xshardstsdataset_roll_multiple_id(self):
        shards_multiple = read_csv(
            os.path.join(self.resource_path, "multiple.csv"))
        horizon = random.randint(1, 10)
        lookback = random.randint(1, 20)

        tsdata = XShardsTSDataset.from_xshards(
            shards_multiple,
            dt_col="datetime",
            target_col="value",
            extra_feature_col=["extra feature"],
            id_col="id")

        with pytest.raises(RuntimeError):
            tsdata.to_xshards()

        # roll train
        tsdata.roll(lookback=lookback, horizon=horizon)
        shards_numpy = tsdata.to_xshards()
        collected_numpy = shards_numpy.collect()  # collect and validate
        x = np.concatenate(
            [collected_numpy[i]['x'] for i in range(len(collected_numpy))],
            axis=0)
        y = np.concatenate(
            [collected_numpy[i]['y'] for i in range(len(collected_numpy))],
            axis=0)
        assert x.shape == ((50 - lookback - horizon + 1) * 2, lookback, 2)
        assert y.shape == ((50 - lookback - horizon + 1) * 2, horizon, 1)

        tsdata.roll(lookback=lookback,
                    horizon=horizon,
                    feature_col=["extra feature"],
                    target_col="value")
        shards_numpy = tsdata.to_xshards()
        collected_numpy = shards_numpy.collect()  # collect and validate
        x = np.concatenate(
            [collected_numpy[i]['x'] for i in range(len(collected_numpy))],
            axis=0)
        y = np.concatenate(
            [collected_numpy[i]['y'] for i in range(len(collected_numpy))],
            axis=0)
        assert x.shape == ((50 - lookback - horizon + 1) * 2, lookback, 2)
        assert y.shape == ((50 - lookback - horizon + 1) * 2, horizon, 1)

        tsdata.roll(lookback=lookback,
                    horizon=horizon,
                    feature_col=[],
                    target_col="value")
        shards_numpy = tsdata.to_xshards()
        collected_numpy = shards_numpy.collect()  # collect and validate
        x = np.concatenate(
            [collected_numpy[i]['x'] for i in range(len(collected_numpy))],
            axis=0)
        y = np.concatenate(
            [collected_numpy[i]['y'] for i in range(len(collected_numpy))],
            axis=0)
        assert x.shape == ((50 - lookback - horizon + 1) * 2, lookback, 1)
        assert y.shape == ((50 - lookback - horizon + 1) * 2, horizon, 1)

        # roll test
        horizon = 0
        lookback = random.randint(1, 20)

        tsdata.roll(lookback=lookback, horizon=horizon)
        shards_numpy = tsdata.to_xshards()
        collected_numpy = shards_numpy.collect()  # collect and validate
        x = np.concatenate(
            [collected_numpy[i]['x'] for i in range(len(collected_numpy))],
            axis=0)
        assert x.shape == ((50 - lookback - horizon + 1) * 2, lookback, 2)
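
The shape assertions follow directly from the rolling-window arithmetic: a length-n series rolled with a given lookback and horizon yields n - lookback - horizon + 1 samples per id, and `multiple.csv` is assumed to hold 50 rows for each of its two ids. A NumPy sketch of that arithmetic:

import numpy as np

n, lookback, horizon = 50, 10, 3
series = np.arange(n, dtype=float)

num_samples = n - lookback - horizon + 1             # 38 windows per id
x = np.stack([series[i:i + lookback] for i in range(num_samples)])
y = np.stack([series[i + lookback:i + lookback + horizon]
              for i in range(num_samples)])

print(x.shape, y.shape)   # (38, 10) (38, 3); with 2 ids the test sees twice as many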
Example #8
    def test_bigdl_pytorch_estimator_shard(self):
        class SimpleModel(nn.Module):
            def __init__(self):
                super(SimpleModel, self).__init__()
                self.fc = nn.Linear(2, 2)

            def forward(self, x):
                x = self.fc(x)
                return F.log_softmax(x, dim=1)

        model = SimpleModel()

        def loss_func(input, target):
            return nn.CrossEntropyLoss().forward(input,
                                                 target.flatten().long())

        def transform(df):
            result = {
                "x":
                np.stack([df['user'].to_numpy(), df['item'].to_numpy()],
                         axis=1),
                "y":
                df['label'].to_numpy()
            }
            return result

        def transform_del_y(d):
            result = {"x": d["x"]}
            return result

        OrcaContext.pandas_read_backend = "pandas"
        file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
        data_shard = read_csv(file_path)
        data_shard = data_shard.transform_shard(transform)

        with tempfile.TemporaryDirectory() as temp_dir_name:
            estimator = Estimator.from_torch(
                model=model,
                loss=loss_func,
                metrics=[Accuracy()],
                optimizer=SGD(learningrate_schedule=Default()),
                model_dir=temp_dir_name)
            estimator.fit(data=data_shard,
                          epochs=4,
                          batch_size=2,
                          validation_data=data_shard,
                          checkpoint_trigger=EveryEpoch())
            state_dict1 = estimator.get_model().state_dict()

            estimator.evaluate(data_shard, batch_size=2)
            est2 = Estimator.from_torch(model=model,
                                        loss=loss_func,
                                        metrics=[Accuracy()],
                                        optimizer=None)
            est2.load_orca_checkpoint(temp_dir_name)
            state_dict2 = est2.get_model().state_dict()

            for name in state_dict1:
                para1 = state_dict1[name]
                para2 = state_dict2[name]
                assert torch.all(torch.eq(para1, para2)), "After reloading the model, " \
                                                          "%r does not match" % name

            est2.fit(data=data_shard,
                     epochs=8,
                     batch_size=2,
                     validation_data=data_shard,
                     checkpoint_trigger=EveryEpoch())
            est2.evaluate(data_shard, batch_size=2)
            pred_result = est2.predict(data_shard)
            pred_c = pred_result.collect()
            assert isinstance(pred_result, SparkXShards)
            pred_shard = data_shard.transform_shard(transform_del_y)
            pred_result2 = est2.predict(pred_shard)
            pred_c_2 = pred_result2.collect()
            assert (pred_c[0]["prediction"] == pred_c_2[0]["prediction"]).all()
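
The checkpoint round-trip above hinges on the state-dict comparison loop; the same check can be packaged as a small standalone helper. A PyTorch-only sketch, where `load_state_dict` stands in for `load_orca_checkpoint`:

import torch
import torch.nn as nn


def state_dicts_equal(sd1, sd2):
    # True if every parameter tensor matches element-wise.
    return all(torch.equal(sd1[name], sd2[name]) for name in sd1)


m1 = nn.Linear(2, 2)
m2 = nn.Linear(2, 2)
m2.load_state_dict(m1.state_dict())   # stand-in for est2.load_orca_checkpoint(...)
assert state_dicts_equal(m1.state_dict(), m2.state_dict())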