Esempio n. 1
    def __getitem__(self, item):
        """
        Project this frame onto a subset of its columns.

        :param item: a column index (int), a column name (string), a slice over
            column positions, or a list mixing indices and names
        :return: a new DistributedDataFrame wrapping the projected view
        :raises ValueError: if ``item`` or one of its elements has an unsupported
            type, an index is out of range, or a name is not a column of this frame
        """
        if not isinstance(item, (list, slice, int, basestring)):
            raise ValueError('Only support sub-setting on columns')

        col_names = self.colnames

        # Normalise every accepted selector to a plain list of indices/names.
        if isinstance(item, (int, basestring)):
            item = [item]
        elif isinstance(item, slice):
            # list() keeps this a real list even where range() is lazy.
            item = list(range(*item.indices(len(col_names))))

        projected_cols = []
        for x in item:
            if isinstance(x, int):
                # Negative indices are rejected rather than wrapped around.
                if x < 0 or x >= len(col_names):
                    raise ValueError('Invalid column index: {}'.format(x))
                projected_cols.append(col_names[x])
            elif isinstance(x, basestring):
                if x not in col_names:
                    raise ValueError('Invalid column name: {}'.format(x))
                projected_cols.append(x)
            else:
                # Do not silently drop a selector the caller asked for.
                raise ValueError('Only support sub-setting on columns')

        java_names = util.to_java_array(projected_cols, self._gateway_client.jvm.String, self._gateway_client)
        # NOTE(review): the original call was cut off after its first argument;
        # passing the gateway client as the second one is an assumption based on
        # the attributes this class reads (_jddf, _gateway_client) -- confirm
        # against the DistributedDataFrame constructor.
        return DistributedDataFrame(self._jddf.getViewHandler().project(java_names),
                                    self._gateway_client)
Esempio n. 2
    def predict(self, data):
        """
        Predict the result of a sample using this ML model

        :param data:  the candidate sample data to be predicted, vector is expected
        :return: predict result, class tag for classification
        """
        # The sample is converted to a Java double array before it is handed to the JVM-side model.
        return self._jml_model.predict(util.to_java_array(data, self._gateway_client.jvm.double, self._gateway_client))
Esempio n. 3
File: Progetto: datascibox/DDF
def logistic_regression_gd(data, step_size=1.0, max_iters=10):
    """
    Train a logistic regression model through the frame's JVM-side ML handler.

    :param data: frame to train on; must expose ``_jddf``, ``_gateway_client`` and ``colnames``
    :param step_size: forwarded to the 'logisticRegressionWithSGD' trainer as its second argument
    :param max_iters: forwarded to the 'logisticRegressionWithSGD' trainer as its first argument
    :return: a LogisticRegressionModel built from the trained JVM model, the gateway and a
        one-row pandas DataFrame holding the intercept followed by the raw model's weights
    """
    ml_obj = java_gateway.get_field(data._jddf, 'ML')
    gateway = data._gateway_client
    # Note the argument order expected by the trainer call: iterations first, then step size.
    model = ml_obj.train('logisticRegressionWithSGD',
                         util.to_java_array([max_iters, step_size],
                                            gateway.jvm.Object, gateway))
    # Fetch the raw JVM model once instead of once per accessor.
    raw_model = model.getRawModel()
    weights = [float(raw_model.intercept())] + list(raw_model.weights().toArray())
    # Presumably the last column is the label and the others are the features,
    # hence colnames[:-1] -- TODO confirm against the trainer's conventions.
    weights = pd.DataFrame(data=[weights], columns=['Intercept'] + data.colnames[:-1])
    return LogisticRegressionModel(model, gateway, weights)