def __getitem__(self, item):
    """
    Project this data frame onto a subset of its columns.

    :param item: a column index (int), a column name (string), a slice over
        column positions, or a list mixing indices and names
    :return: a new DistributedDataFrame wrapping the projected view
    :raises ValueError: if item is of an unsupported type, an index is
        negative or out of range, a name is not an existing column, or a
        list element is neither an int nor a string
    """
    if not isinstance(item, (list, slice, int, basestring)):
        raise ValueError('Only support sub-setting on columns')

    col_names = self.colnames
    n_cols = len(col_names)

    # Normalise every accepted form into an iterable of selectors.
    if isinstance(item, (int, basestring)):
        item = [item]
    elif isinstance(item, slice):
        item = range(*item.indices(n_cols))

    projected_cols = []
    for x in item:
        if isinstance(x, int):
            # Negative indices are rejected rather than wrapped around.
            if x < 0 or x >= n_cols:
                raise ValueError('Invalid column index: {}'.format(x))
            projected_cols.append(col_names[x])
        elif isinstance(x, basestring):
            if x not in col_names:
                raise ValueError('Invalid column name: {}'.format(x))
            projected_cols.append(x)
        else:
            # Previously such elements were silently skipped, yielding a
            # projection with fewer columns than the caller asked for.
            raise ValueError('Invalid column selector: {!r}'.format(x))

    gateway = self._gateway_client
    view_handler = self._jddf.getViewHandler()
    java_cols = util.to_java_array(projected_cols, gateway.jvm.String, gateway)
    return DistributedDataFrame(view_handler.project(java_cols), gateway)
def __getitem__(self, item):
    """
    Project this data frame onto a subset of its columns.

    :param item: a column index (int), a column name (string), a slice over
        column positions, or a list mixing indices and names
    :return: a new DistributedDataFrame wrapping the projected view
    :raises ValueError: if item is of an unsupported type, an index is
        negative or out of range, a name is not an existing column, or a
        list element is neither an int nor a string
    """
    if not isinstance(item, (list, slice, int, basestring)):
        raise ValueError('Only support sub-setting on columns')

    col_names = self.colnames
    n_cols = len(col_names)

    # Normalise every accepted form into an iterable of selectors.
    if isinstance(item, (int, basestring)):
        item = [item]
    elif isinstance(item, slice):
        item = range(*item.indices(n_cols))

    projected_cols = []
    for x in item:
        if isinstance(x, int):
            # Negative indices are rejected rather than wrapped around.
            if x < 0 or x >= n_cols:
                raise ValueError('Invalid column index: {}'.format(x))
            projected_cols.append(col_names[x])
        elif isinstance(x, basestring):
            if x not in col_names:
                raise ValueError('Invalid column name: {}'.format(x))
            projected_cols.append(x)
        else:
            # Previously such elements were silently skipped, yielding a
            # projection with fewer columns than the caller asked for.
            raise ValueError('Invalid column selector: {!r}'.format(x))

    gateway = self._gateway_client
    view_handler = self._jddf.getViewHandler()
    java_cols = util.to_java_array(projected_cols, gateway.jvm.String, gateway)
    return DistributedDataFrame(view_handler.project(java_cols), gateway)
def predict(self, data):
    """
    Run a single sample through the wrapped Java ML model.

    The sample is handed to ``util.to_java_array`` with the JVM ``double``
    type before being passed to the model's own ``predict``.

    :param data: the candidate sample to be predicted, a vector is expected
    :return: whatever the underlying model's ``predict`` returns (the class
        tag for classification models)
    """
    java_model = self._jml_model
    gateway = self._gateway_client
    sample = util.to_java_array(data, gateway.jvm.double, gateway)
    return java_model.predict(sample)
def logistic_regression_gd(data, step_size=1.0, max_iters=10):
    """
    Train a logistic regression model with the 'logisticRegressionWithSGD'
    trainer exposed by the data frame's ML handler.

    :param data: the distributed data frame to train on; every column except
        the last is used to label the weights (presumably the last column is
        the label -- TODO confirm)
    :param step_size: step size forwarded to the trainer
    :param max_iters: iteration count forwarded to the trainer
    :return: a LogisticRegressionModel wrapping the trained Java model, the
        gateway client and a one-row pandas DataFrame holding the intercept
        followed by the weights
    """
    ml_handler = java_gateway.get_field(data._jddf, 'ML')
    client = data._gateway_client

    # The trainer takes its parameters as (max_iters, step_size), in that order.
    train_args = util.to_java_array([max_iters, step_size],
                                    client.jvm.Object, client)
    trained = ml_handler.train('logisticRegressionWithSGD', train_args)

    coefficients = [float(trained.getRawModel().intercept())]
    coefficients.extend(trained.getRawModel().weights().toArray())

    column_labels = ['Intercept'] + data.colnames[:-1]
    weight_table = pd.DataFrame(data=[coefficients], columns=column_labels)
    return LogisticRegressionModel(trained, client, weight_table)
def logistic_regression_gd(data, step_size=1.0, max_iters=10):
    """
    Train a logistic regression model with the 'logisticRegressionWithSGD'
    trainer exposed by the data frame's ML handler.

    :param data: the distributed data frame to train on; every column except
        the last is used to label the weights (presumably the last column is
        the label -- TODO confirm)
    :param step_size: step size forwarded to the trainer
    :param max_iters: iteration count forwarded to the trainer
    :return: a LogisticRegressionModel wrapping the trained Java model, the
        gateway client and a one-row pandas DataFrame holding the intercept
        followed by the weights
    """
    ml_handler = java_gateway.get_field(data._jddf, 'ML')
    client = data._gateway_client

    # The trainer takes its parameters as (max_iters, step_size), in that order.
    train_args = util.to_java_array([max_iters, step_size],
                                    client.jvm.Object, client)
    trained = ml_handler.train('logisticRegressionWithSGD', train_args)

    coefficients = [float(trained.getRawModel().intercept())]
    coefficients.extend(trained.getRawModel().weights().toArray())

    column_labels = ['Intercept'] + data.colnames[:-1]
    weight_table = pd.DataFrame(data=[coefficients], columns=column_labels)
    return LogisticRegressionModel(trained, client, weight_table)