Example #1
    def test_bigdl_pytorch_estimator_dataframe_predict(self):
        def loss_func(input, target):
            return nn.CrossEntropyLoss().forward(input, target.flatten().long())

        class IdentityNet(nn.Module):
            def __init__(self):
                super().__init__()
                # needed so the optimizer does not raise an empty-parameter-list error
                self.fc1 = nn.Linear(5, 5)

            def forward(self, input_):
                return input_

        model = IdentityNet()
        rdd = self.sc.range(0, 100)
        df = rdd.map(lambda x: ([float(x)] * 5,
                                [int(np.random.randint(0, 2,
                                                       size=()))])).toDF(["feature", "label"])

        with tempfile.TemporaryDirectory() as temp_dir_name:
            estimator = Estimator.from_torch(model=model, loss=loss_func,
                                             optimizer=SGD(learningrate_schedule=Default()),
                                             model_dir=temp_dir_name)
            result = estimator.predict(df, feature_cols=["feature"])
            expr = "sum(cast(feature <> to_array(prediction) as int)) as error"
            assert result.selectExpr(expr).first()["error"] == 0
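
The loss_func wrapper above exists because nn.CrossEntropyLoss expects class indices of shape (batch,) with dtype long, while the label column arrives as a (batch, 1) tensor. A minimal standalone sketch of that shape handling (the tensors here are illustrative, not taken from the test):

import torch
import torch.nn as nn

logits = torch.randn(4, 5)                    # (batch, classes), as the model outputs
labels = torch.randint(0, 2, (4, 1)).float()  # (batch, 1), as read from the "label" column
# flatten() drops the trailing dimension and long() converts to class indices,
# which is exactly what loss_func does above.
loss = nn.CrossEntropyLoss()(logits, labels.flatten().long())
print(loss.item())
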
Example #2
    def test_bigdl_pytorch_estimator_dataframe_fit_evaluate(self):
        class SimpleModel(nn.Module):
            def __init__(self):
                super(SimpleModel, self).__init__()
                self.fc = nn.Linear(5, 5)

            def forward(self, x):
                x = self.fc(x)
                return F.log_softmax(x, dim=1)

        model = SimpleModel()

        def loss_func(input, target):
            return nn.CrossEntropyLoss().forward(input, target.flatten().long())

        rdd = self.sc.range(0, 100)
        df = rdd.map(lambda x: ([float(x)] * 5,
                                [int(np.random.randint(0, 2,
                                                       size=()))])).toDF(["feature", "label"])

        with tempfile.TemporaryDirectory() as temp_dir_name:
            estimator = Estimator.from_torch(model=model, loss=loss_func, metrics=[Accuracy()],
                                             optimizer=SGD(learningrate_schedule=Default()),
                                             model_dir=temp_dir_name)
            estimator.fit(data=df, epochs=4, batch_size=2, validation_data=df,
                          checkpoint_trigger=EveryEpoch(),
                          feature_cols=["feature"], label_cols=["label"])
            eval_result = estimator.evaluate(df, batch_size=2,
                                             feature_cols=["feature"], label_cols=["label"])
            assert isinstance(eval_result, dict)
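
evaluate returns a plain dict mapping metric names to values; with metrics=[Accuracy()] an accuracy entry is expected, though the exact key string depends on the BigDL version. A hedged usage sketch:

# Inspect whichever metric keys this BigDL version reports.
for name, value in eval_result.items():
    print(name, value)
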
Example #3
    def test_bigdl_pytorch_estimator_pandas_dataframe(self):
        class SimpleModel(nn.Module):
            def __init__(self):
                super(SimpleModel, self).__init__()
                self.fc = nn.Linear(1, 10)

            def forward(self, x):
                x = torch.unsqueeze(x, dim=1)
                x = self.fc(x)
                return F.log_softmax(x, dim=1)

        def loss_func(input, target):
            return nn.CrossEntropyLoss().forward(input,
                                                 target.flatten().long())

        model = SimpleModel()

        OrcaContext.pandas_read_backend = "pandas"
        file_path = os.path.join(resource_path,
                                 "orca/learn/simple_feature_label.csv")
        data_shard = read_csv(file_path)

        with tempfile.TemporaryDirectory() as temp_dir_name:
            estimator = Estimator.from_torch(
                model=model,
                loss=loss_func,
                metrics=[Accuracy()],
                optimizer=SGD(learningrate_schedule=Default()),
                model_dir=temp_dir_name)
            estimator.fit(data=data_shard,
                          epochs=1,
                          batch_size=4,
                          feature_cols=['feature'],
                          label_cols=['label'],
                          validation_data=data_shard,
                          checkpoint_trigger=EveryEpoch())
            estimator.evaluate(data_shard,
                               batch_size=4,
                               feature_cols=['feature'],
                               label_cols=['label'])
            est2 = Estimator.from_torch(model=model,
                                        loss=loss_func,
                                        metrics=[Accuracy()],
                                        optimizer=None)
            est2.load_orca_checkpoint(temp_dir_name)
            est2.predict(data_shard, batch_size=4, feature_cols=['feature'])
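
This example reads simple_feature_label.csv through Orca's pandas backend. Since the model applies torch.unsqueeze(x, dim=1) before a Linear(1, 10) layer, each row is assumed to carry one scalar feature and an integer class label. A hedged sketch of a CSV with that layout (the real resource file may differ):

import pandas as pd

# Illustrative stand-in for resources/orca/learn/simple_feature_label.csv.
pd.DataFrame({"feature": [0.5, 1.5, 2.5, 3.5],
              "label": [0, 1, 2, 3]}).to_csv("simple_feature_label.csv", index=False)
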
Example #4
    def __init__(self,
                 model,
                 loss,
                 optimizer,
                 config=None,
                 metrics=None,
                 model_dir=None,
                 bigdl_type="float"):
        from bigdl.orca.torch import TorchModel, TorchLoss, TorchOptim
        self.loss = loss
        self.optimizer = optimizer
        self.config = {} if config is None else config

        if self.loss is None:
            self.loss = TorchLoss()
        else:
            self.loss = TorchLoss.from_pytorch(loss)
        if isinstance(model, types.FunctionType):

            def model_creator(self):
                return model(self.config)

            model = model_creator(self)
        if self.optimizer is None:
            from bigdl.orca.learn.optimizers.schedule import Default
            self.optimizer = SGD(
                learningrate_schedule=Default()).get_optimizer()
        elif isinstance(self.optimizer, TorchOptimizer):
            self.optimizer = TorchOptim.from_pytorch(self.optimizer)
        elif isinstance(self.optimizer, OrcaOptimizer):
            self.optimizer = self.optimizer.get_optimizer()
        else:
            raise ValueError(
                "Only PyTorch optimizer and orca optimizer are supported")
        from bigdl.orca.learn.metrics import Metric
        self.metrics = Metric.convert_metrics_list(metrics)
        self.log_dir = None
        self.app_name = None
        self.model_dir = model_dir
        self.model = TorchModel.from_pytorch(model)
        self.estimator = SparkEstimator(self.model,
                                        self.optimizer,
                                        model_dir,
                                        bigdl_type=bigdl_type)
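
The types.FunctionType branch above means model may be either an nn.Module instance or a creator function that receives the config dict. A minimal sketch of the creator path; make_model is hypothetical, and whether from_torch forwards config this way should be verified against your BigDL version:

import torch.nn as nn

def make_model(config):
    # Invoked by the constructor with self.config when a plain function is passed.
    return nn.Linear(config.get("in_features", 5), 5)

# Assumes the Estimator/SGD/Default imports and the loss_func from the
# surrounding examples are in scope.
est = Estimator.from_torch(model=make_model,
                           loss=loss_func,
                           optimizer=SGD(learningrate_schedule=Default()),
                           config={"in_features": 5})
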
Example #5
    def test_estimator_keras_with_bigdl_optim_method(self):
        tf.reset_default_graph()

        model = self.create_model()

        dataset = tf.data.Dataset.from_tensor_slices((np.random.randint(0, 200, size=(100, 1)),
                                                      np.random.randint(0, 50, size=(100, 1)),
                                                      np.ones(shape=(100,), dtype=np.int32)))
        dataset = dataset.map(lambda user, item, label: [(user, item), label])
        from bigdl.orca.learn.optimizers import SGD
        from bigdl.orca.learn.optimizers.schedule import Plateau
        sgd = SGD(learningrate=0.1,
                  learningrate_schedule=Plateau("score",
                                                factor=0.1,
                                                patience=10,
                                                mode="min"))
        est = Estimator.from_keras(keras_model=model, optimizer=sgd)
        est.fit(data=dataset,
                batch_size=8,
                epochs=10,
                validation_data=dataset)
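
Plateau here is assumed to follow standard reduce-on-plateau semantics: watch the named metric ("score"), and in "min" mode multiply the learning rate by factor once the metric has failed to improve for patience epochs. An illustration of that assumed decay (plain arithmetic, not a BigDL call):

# Each time "score" stalls for `patience` epochs in "min" mode, lr *= factor.
lr = 0.1
for plateau in range(3):
    lr *= 0.1
    print(round(lr, 6))  # 0.01, then 0.001, then 0.0001
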
Example #6
    def test_estimator_graph_with_bigdl_optim_method(self):
        import bigdl.orca.data.pandas

        tf.reset_default_graph()

        model = SimpleModel()
        file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
        data_shard = bigdl.orca.data.pandas.read_csv(file_path)

        def transform(df):
            result = {
                "x": (df['user'].to_numpy(), df['item'].to_numpy()),
                "y": df['label'].to_numpy()
            }
            return result

        data_shard = data_shard.transform_shard(transform)
        from bigdl.orca.learn.optimizers import SGD
        from bigdl.orca.learn.optimizers.schedule import Plateau
        sgd = SGD(learningrate=0.1,
                  learningrate_schedule=Plateau(
                      "score",
                      factor=0.1,
                      patience=10,
                      mode="min",
                  ))
        est = Estimator.from_graph(inputs=[model.user, model.item],
                                   labels=[model.label],
                                   outputs=[model.logits],
                                   loss=model.loss,
                                   optimizer=sgd,
                                   metrics={"loss": model.loss})
        est.fit(data=data_shard,
                batch_size=8,
                epochs=10,
                validation_data=data_shard)
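
A quick local check of the transform contract above: each pandas shard becomes a dict whose "x" entry is the tuple of per-input arrays and whose "y" entry is the label array (the data here is illustrative):

import pandas as pd

df = pd.DataFrame({"user": [1, 2], "item": [3, 4], "label": [0, 1]})
shard = transform(df)
print(shard["x"][0])  # array([1, 2]) -> feeds model.user
print(shard["y"])     # array([0, 1]) -> feeds model.label
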
Example #7
    if warmup_iteration == 0:
        warmupDelta = 0.0
    else:
        if options.maxLr:
            maxlr = options.maxLr
        else:
            maxlr = options.learningRate
        warmupDelta = (maxlr - options.learningRate) / warmup_iteration
    polyIteration = maxIteration - warmup_iteration
    lrSchedule = SequentialSchedule(iterationPerEpoch)
    lrSchedule.add(Warmup(warmupDelta), warmup_iteration)
    lrSchedule.add(Poly(0.5, maxIteration), polyIteration)
    optim = SGD(learningrate=options.learningRate,
                learningrate_decay=0.0,
                weightdecay=options.weightDecay,
                momentum=0.9,
                dampening=0.0,
                nesterov=False,
                learningrate_schedule=lrSchedule)

    if options.maxEpoch:
        checkpoint_trigger = EveryEpoch()
    else:
        checkpoint_trigger = SeveralIteration(options.checkpointIteration)

    def calculate_top_k_accuracy(logits, targets, k=1):
        values, indices = tf.math.top_k(logits, k=k, sorted=True)
        y = tf.reshape(targets, [-1, 1])
        correct = tf.cast(tf.equal(y, indices), tf.float32)
        top_k_accuracy = tf.reduce_mean(correct) * k
        return top_k_accuracy
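
Two sanity checks for this snippet. First, the warmup arithmetic: with illustrative values learningRate=0.1, maxLr=0.5 and warmup_iteration=200, warmupDelta = (0.5 - 0.1) / 200 = 0.002, so the learning rate climbs linearly to maxLr before Poly(0.5, maxIteration) takes over for the remaining polyIteration steps. Second, the "* k" trick in calculate_top_k_accuracy: tf.reduce_mean averages over batch * k entries while each row contributes at most one hit, so multiplying by k recovers the per-row hit rate. A TF1-style check with illustrative values:

import tensorflow as tf

logits = tf.constant([[0.1, 0.9],
                      [0.8, 0.2]])
targets = tf.constant([1, 1])
# Row 0 predicts class 1 (hit), row 1 predicts class 0 (miss).
acc = calculate_top_k_accuracy(logits, targets, k=1)
with tf.Session() as sess:
    print(sess.run(acc))  # 0.5
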
Example #8
    def test_bigdl_pytorch_estimator_shard(self):
        class SimpleModel(nn.Module):
            def __init__(self):
                super(SimpleModel, self).__init__()
                self.fc = nn.Linear(2, 2)

            def forward(self, x):
                x = self.fc(x)
                return F.log_softmax(x, dim=1)

        model = SimpleModel()

        def loss_func(input, target):
            return nn.CrossEntropyLoss().forward(input,
                                                 target.flatten().long())

        def transform(df):
            result = {
                "x":
                np.stack([df['user'].to_numpy(), df['item'].to_numpy()],
                         axis=1),
                "y":
                df['label'].to_numpy()
            }
            return result

        def transform_del_y(d):
            result = {"x": d["x"]}
            return result

        OrcaContext.pandas_read_backend = "pandas"
        file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
        data_shard = read_csv(file_path)
        data_shard = data_shard.transform_shard(transform)

        with tempfile.TemporaryDirectory() as temp_dir_name:
            estimator = Estimator.from_torch(
                model=model,
                loss=loss_func,
                metrics=[Accuracy()],
                optimizer=SGD(learningrate_schedule=Default()),
                model_dir=temp_dir_name)
            estimator.fit(data=data_shard,
                          epochs=4,
                          batch_size=2,
                          validation_data=data_shard,
                          checkpoint_trigger=EveryEpoch())
            state_dict1 = estimator.get_model().state_dict()

            estimator.evaluate(data_shard, batch_size=2)
            est2 = Estimator.from_torch(model=model,
                                        loss=loss_func,
                                        metrics=[Accuracy()],
                                        optimizer=None)
            est2.load_orca_checkpoint(temp_dir_name)
            state_dict2 = est2.get_model().state_dict()

            for name in state_dict1:
                para1 = state_dict1[name]
                para2 = state_dict2[name]
                assert torch.all(torch.eq(para1, para2)), "After reloading the model, " \
                                                          "%r does not match" % name

            est2.fit(data=data_shard,
                     epochs=8,
                     batch_size=2,
                     validation_data=data_shard,
                     checkpoint_trigger=EveryEpoch())
            est2.evaluate(data_shard, batch_size=2)
            pred_result = est2.predict(data_shard)
            pred_c = pred_result.collect()
            assert isinstance(pred_result, SparkXShards)
            pred_shard = data_shard.transform_shard(transform_del_y)
            pred_result2 = est2.predict(pred_shard)
            pred_c_2 = pred_result2.collect()
            assert (pred_c[0]["prediction"] == pred_c_2[0]["prediction"]).all()
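
predict on an XShards returns another SparkXShards whose elements gain a "prediction" key, which is what the final assertion compares across the two runs. A minimal access sketch:

# Each collected element is a dict; predictions sit alongside any original keys.
for shard in pred_result.collect():
    print(shard["prediction"].shape)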