def image_frame(cls, image_frame, memory_type="DRAM", sequential_order=False,
                shuffle=True, bigdl_type="float"):
    """
    Build a FeatureSet out of an ImageFrame.

    :param image_frame: ImageFrame to wrap
    :param memory_type: string, DRAM, PMEM or an Int number.
           DRAM caches the dataset in dynamic random-access memory.
           PMEM caches the dataset in Intel Optane DC Persistent Memory.
           An Int number n caches the dataset on disk and holds only 1/n of
           the data in memory during training; once that 1/n has been
           consumed the current cache is released and another 1/n is loaded.
    :param sequential_order: whether to iterate the elements in the feature
           set in sequential order for training.
    :param shuffle: whether to shuffle the elements in each partition before
           each epoch when training
    :param bigdl_type: numeric type
    :return: A feature set
    """
    creator_args = (image_frame, memory_type, sequential_order, shuffle)
    return cls(jvalue=callZooFunc(bigdl_type,
                                  "createFeatureSetFromImageFrame",
                                  *creator_args))
def from_pytorch(model):
    """
    Create a TorchModel directly from PyTorch model, e.g. model in
    torchvision.models.

    :param model: a PyTorch model, or a function/class that creates a
           PyTorch model when called with no arguments
    :return: a TorchModel built from the serialized `model` and its
             flattened trainable weights
    """
    import types

    # Python 3 has no types.ClassType (classes are instances of `type`);
    # the getattr fallback keeps old-style classes recognised on Python 2.
    factory_types = (types.FunctionType, type,
                     getattr(types, "ClassType", type))

    # A factory is instantiated only to read its initial weights; the object
    # serialized by torch.save below is still the original `model` argument.
    module = model() if isinstance(model, factory_types) else model
    weights = [param.view(-1) for param in trainable_param(module)]
    flatten_weight = torch.nn.utils.parameters_to_vector(
        weights).data.numpy()

    bys = io.BytesIO()
    torch.save(model, bys, pickle_module=zoo_pickle_module)
    model_bytes = bys.getvalue()

    jvalue = callZooFunc("float", "createTorchModel", model_bytes,
                         JTensor.from_ndarray(flatten_weight))
    return TorchModel(jvalue, model_bytes)
def from_relation_lists(cls, relations, corpus1, corpus2, bigdl_type="float"):
    """
    Generate a TextSet for ranking.

    The method works as follows:
    1. Relations are grouped by id1: for each id1, the list of id2 (with the
       corresponding label) that appears together with it is collected.
    2. The groups are joined with the corpora to turn each id into
       indexedTokens.
       Note: the corpora must already have been transformed by
       SequenceShaper and WordIndexer.
    3. For each list, a TextFeature is generated having a Sample with:
       - feature of shape (list_length, text1_length + text2_length).
       - label of shape (list_length, 1).

    :param relations: List or RDD of Relation.
    :param corpus1: TextSet that contains all id1 in relations. For each
           TextFeature in corpus1, text must have been transformed to
           indexedTokens of the same length.
    :param corpus2: TextSet that contains all id2 in relations. For each
           TextFeature in corpus2, text must have been transformed to
           indexedTokens of the same length.
    Note that if relations is a list, then corpus1 and corpus2 must both be
    LocalTextSet. If relations is RDD, then corpus1 and corpus2 must both be
    DistributedTextSet.

    :return: TextSet.
    :raises TypeError: if relations is neither an RDD nor a list.
    """
    if isinstance(relations, RDD):
        relation_tuples = relations.map(lambda rel: rel.to_tuple())
    elif isinstance(relations, list):
        relation_tuples = [rel.to_tuple() for rel in relations]
    else:
        raise TypeError("relations should be RDD or list of Relation")

    return TextSet(jvalue=callZooFunc(bigdl_type, "textSetFromRelationLists",
                                      relation_tuples, corpus1, corpus2))
def read(cls, path, sc=None, min_partitions=1, resize_height=-1,
         resize_width=-1, image_codec=-1, with_label=False,
         one_based_label=True, bigdl_type="float"):
    """
    Read images as Image Set

    :param path: path to read images from. If sc is defined, path can be
           local or HDFS. Wildcard characters are supported. If with_label
           is set to true, path should be a directory that has two levels:
           the first level is class folders and the second is images. All
           images that belong to the same class should be put into the same
           class folder, so each image in the path is labeled by the folder
           it belongs to.
    :param sc: SparkContext
    :param min_partitions: A suggestion value of the minimal splitting
           number for input data.
    :param resize_height: height after resize, by default is -1 which will
           not resize the image
    :param resize_width: width after resize, by default is -1 which will
           not resize the image
    :param image_codec: specifying the color type of a loaded image, same as
           in OpenCV.imread. By default is
           Imgcodecs.CV_LOAD_IMAGE_UNCHANGED(-1)
    :param with_label: whether to treat folders in the path as image
           classification labels and read the labels into ImageSet.
    :param one_based_label: whether to use one based label
    :param bigdl_type: numeric type passed to callZooFunc
    :return: ImageSet
    """
    reader_args = (path, sc, min_partitions, resize_height, resize_width,
                   image_codec, with_label, one_based_label)
    jvalue = callZooFunc(bigdl_type, "readImageSet", *reader_args)
    return ImageSet(jvalue=jvalue)
def init_from_existing_model(path, weight_path=None, input_seq_len=-1.0,
                             hidden_drop=-1.0, attn_drop=-1.0,
                             output_all_block=True, bigdl_type="float"):
    """
    Load an existing BERT model (with weights).

    # Arguments
    path: The path for the pre-defined model.
          Local file system, HDFS and Amazon S3 are supported.
          HDFS path should be like 'hdfs://[host]:[port]/xxx'.
          Amazon S3 path should be like 's3a://bucket/xxx'.
    weight_path: The path for pre-trained weights if any. Default is None.
    input_seq_len, hidden_drop, attn_drop, output_all_block: forwarded
          unchanged to the "loadBERT" function.
    bigdl_type: numeric type passed to callZooFunc and to the Layer wrapper.
    """
    loader_args = (path, weight_path, input_seq_len, hidden_drop, attn_drop,
                   output_all_block)
    bert = Layer(jvalue=callZooFunc(bigdl_type, "loadBERT", *loader_args),
                 bigdl_type=bigdl_type)
    # The generic Layer wrapper is re-typed in place so callers get a BERT.
    bert.__class__ = BERT
    return bert
def predict_classes(self, x, batch_per_thread=4, zero_based_label=True):
    """
    Use a model to predict for classes. By default, label predictions start
    from 0.

    # Arguments
    x: Prediction data. A Numpy array or RDD of Sample.
    batch_per_thread: The default value is 4.
        When distributed is True, the total batch size is
        batch_per_thread * rdd.getNumPartitions.
        When distributed is False, the total batch size is
        batch_per_thread * numOfCores.
    zero_based_label: Boolean. Whether result labels start from 0.
        Default is True. If False, result labels start from 1.

    # Raises
    TypeError: if x is neither a Numpy array nor an RDD.
    """
    if isinstance(x, np.ndarray):
        # to_sample_rdd receives an all-zero array with one entry per row
        # of x alongside the prediction data.
        zero_labels = np.zeros([x.shape[0]])
        samples = to_sample_rdd(x, zero_labels)
    elif not isinstance(x, RDD):
        raise TypeError("Unsupported prediction data type: %s" % type(x))
    else:
        samples = x

    return callZooFunc(self.bigdl_type, "zooPredictClasses", self.value,
                       samples, batch_per_thread, zero_based_label)
def save_checkpoint(self):
    """
    Forward to the "saveCheckpoint" function through callZooFunc, passing
    this object's bigdl_type and wrapped value.
    """
    numeric_type, wrapped = self.bigdl_type, self.value
    callZooFunc(numeric_type, "saveCheckpoint", wrapped)
def __init__(self, bigdl_type="float", *args):
    """
    Create the wrapped value by calling, through callZooFunc, the function
    named by JavaValue.jvm_class_constructor(self).

    :param bigdl_type: numeric type passed to callZooFunc, "float" by default
    :param args: positional arguments forwarded to that function
    """
    self.bigdl_type = bigdl_type
    constructor_name = JavaValue.jvm_class_constructor(self)
    self.value = callZooFunc(bigdl_type, constructor_name, *args)
def get_optimizer_version(bigdl_type="float"):
    """
    Get DistriOptimizer version.

    :param bigdl_type: numeric type passed to callZooFunc
    :return: optimizerVersion, as returned by "getOptimizerVersion"
    """
    optimizer_version = callZooFunc(bigdl_type, "getOptimizerVersion")
    return optimizer_version
def load(path):
    """
    Load an XGBRegressorModel from `path`.

    :param path: location passed to the "loadXGBRegressorModel" function
    :return: XGBRegressorModel wrapping the loaded value
    """
    return XGBRegressorModel(
        jvalue=callZooFunc("float", "loadXGBRegressorModel", path))
def transform(self, dataset):
    """
    Apply the wrapped regressor model to `dataset`.

    :param dataset: input passed to "transformXGBRegressorModel"
    :return: whatever "transformXGBRegressorModel" returns (named `df` in
             the original code)
    """
    return callZooFunc("float", "transformXGBRegressorModel", self.value,
                       dataset)
def setPredictionCol(self, prediction):
    """
    Set the prediction column on the wrapped regressor model.

    :param prediction: value forwarded to "setPredictionXGBRegressorModel"
    """
    wrapped = self.value
    callZooFunc("float", "setPredictionXGBRegressorModel", wrapped,
                prediction)
def load(path):
    """
    Load an NNClassifierModel from `path`.

    :param path: location passed to the "loadNNClassifierModel" function
    :return: NNClassifierModel built around the loaded value, with no model
             and no feature preprocessing supplied
    """
    loaded = callZooFunc("float", "loadNNClassifierModel", path)
    return NNClassifierModel(model=None,
                             feature_preprocessing=None,
                             jvalue=loaded)
def __init__(self, model, criterion, feature_preprocessing=None,
             label_preprocessing=None, jvalue=None, bigdl_type="float"):
    """
    Build an NNEstimator from a BigDL model, a criterion and the
    Preprocessing applied to feature and label data.

    :param model: BigDL Model to be trained.
    :param criterion: BigDL criterion.
    :param feature_preprocessing: Converts the data in the feature column to
           a Tensor or to a Sample directly. It expects a List of Int as the
           size of the converted Tensor, or a Preprocessing[F, Tensor[T]].

           If a List of Int is given, only feature columns holding Float,
           Double, Int, Array[Float], Array[Double], Array[Int] or MLlib
           Vector can be handled. The feature data are converted to Tensors
           with the specified sizes before being sent to the model;
           internally a SeqToTensor is generated from the size and used as
           the feature_preprocessing.

           Alternatively, a Preprocessing[F, Tensor[T]] that transforms the
           feature data to a Tensor[T] can be supplied. Some pre-defined
           Preprocessing are provided in package zoo.feature. Multiple
           Preprocessing can be combined as a ChainedPreprocessing.

           The feature_preprocessing will also be copied to the generated
           NNModel and applied to the feature column during transform.
    :param label_preprocessing: similar to feature_preprocessing, but applies
           to Label data.
    :param jvalue: Java object created by Py4j
    :param bigdl_type: optional parameter. data type of model,
           "float"(default) or "double".
    """
    super(NNEstimator, self).__init__()

    # Defaults are created here rather than in the signature, which avoids
    # initialization during import. Any falsy value gets the default.
    feature_preprocessing = feature_preprocessing or SeqToTensor()
    label_preprocessing = label_preprocessing or SeqToTensor()

    # A plain list is a size spec: a list of lists becomes
    # SeqToMultipleTensors, a list of ints becomes SeqToTensor. A list whose
    # first element is neither is left as-is.
    if type(feature_preprocessing) is list:
        first_entry = feature_preprocessing[0]
        if type(first_entry) is list:
            feature_preprocessing = SeqToMultipleTensors(
                feature_preprocessing)
        elif isinstance(first_entry, int):
            feature_preprocessing = SeqToTensor(feature_preprocessing)

    if type(label_preprocessing) is list:
        assert all(isinstance(size, int) for size in label_preprocessing)
        label_preprocessing = SeqToTensor(label_preprocessing)

    sample_preprocessing = FeatureLabelPreprocessing(
        feature_preprocessing, label_preprocessing)

    # Reuse a supplied Java object; otherwise construct one.
    if jvalue:
        self.value = jvalue
    else:
        self.value = callZooFunc(bigdl_type, self.jvm_class_constructor(),
                                 model, criterion, sample_preprocessing)
    self.model = model
    self.samplePreprocessing = sample_preprocessing
    self.bigdl_type = bigdl_type
    self._java_obj = self.value

    param_docs = (
        ("maxEpoch", "number of max Epoch"),
        ("learningRate", "learning rate"),
        ("learningRateDecay", "learning rate decay"),
        ("cachingSample", "cachingSample"),
    )
    for param_name, param_doc in param_docs:
        setattr(self, param_name, Param(self, param_name, param_doc))

    self.train_summary = None
    self.validation_config = None
    self.checkpoint_config = None
    self.validation_summary = None
    self.endWhen = None
    self.dataCacheLevel = "DRAM"
def write_parquet(df, path, mode):
    """
    Forward `df`, `path` and `mode` to the "dfWriteParquet" function.

    :param df: first argument passed to "dfWriteParquet"
    :param path: second argument passed to "dfWriteParquet"
    :param mode: third argument passed to "dfWriteParquet"
    """
    writer_args = (df, path, mode)
    callZooFunc("float", "dfWriteParquet", *writer_args)
def fit(self, df):
    """
    Fit the wrapped regressor on `df`.

    :param df: input passed to "fitXGBRegressor"
    :return: whatever "fitXGBRegressor" returns
    """
    fitted = callZooFunc("float", "fitXGBRegressor", self.value, df)
    return fitted
def setFeaturesCol(self, features):
    """
    Set the features column on the wrapped regressor model.

    :param features: value forwarded to "setFeaturesXGBRegressorModel"
    """
    wrapped = self.value
    callZooFunc("float", "setFeaturesXGBRegressorModel", wrapped, features)
def setFeaturesCol(self, features):
    """
    Set the features column on the wrapped classifier model.

    :param features: value forwarded to "setFeaturesXGBClassifierModel"
    """
    wrapped = self.value
    callZooFunc("float", "setFeaturesXGBClassifierModel", wrapped, features)
def setInferBatchSize(self, value: int):
    """
    Set the inference batch size on the wrapped regressor model.

    :param value: value forwarded to "setInferBatchSizeXGBRegressorModel"
    """
    wrapped = self.value
    callZooFunc("float", "setInferBatchSizeXGBRegressorModel", wrapped,
                value)
def setPredictionCol(self, prediction):
    """
    Set the prediction column on the wrapped classifier model.

    :param prediction: value forwarded to "setPredictionXGBClassifierModel"
    """
    wrapped = self.value
    callZooFunc("float", "setPredictionXGBClassifierModel", wrapped,
                prediction)
def save(self, path):
    """
    Save the wrapped regressor model to `path`.

    A progress message is printed before the "saveXGBRegressorModel"
    function is called.

    :param path: destination passed to "saveXGBRegressorModel"
    """
    print("start saving in python side")
    wrapped = self.value
    callZooFunc("float", "saveXGBRegressorModel", wrapped, path)
def transform(self, dataset):
    """
    Apply the wrapped classifier model to `dataset`.

    :param dataset: input passed to "transformXGBClassifierModel"
    :return: whatever "transformXGBClassifierModel" returns (named `df` in
             the original code)
    """
    return callZooFunc("float", "transformXGBClassifierModel", self.value,
                       dataset)
def set_optimizer_version(optimizerVersion, bigdl_type="float"):
    """
    Set DistriOptimizer version.

    :param optimizerVersion: should be "OptimizerV1" or "OptimizerV2".
    :param bigdl_type: numeric type passed to callZooFunc
    """
    setter_name = "setOptimizerVersion"
    callZooFunc(bigdl_type, setter_name, optimizerVersion)
def __init__(self):
    """
    Create an XGBRegressor whose `value` is the object returned by the
    "getXGBRegressor" function, called with numeric type "float".
    """
    super(XGBRegressor, self).__init__()
    self.value = callZooFunc("float", "getXGBRegressor")
def __init__(self, module_bytes, weights, bigdl_type="float"):
    """
    Build the wrapper from serialized module bytes and its weights.

    :param module_bytes: serialized module, stored on the instance and passed
           to the constructor function
    :param weights: ndarray of weights; converted with JTensor.from_ndarray
           before being passed to the constructor function
    :param bigdl_type: numeric type passed to callZooFunc, "float" by default
    """
    weight_tensor = JTensor.from_ndarray(weights)
    self.module_bytes = module_bytes
    constructor_name = self.jvm_class_constructor()
    self.value = callZooFunc(bigdl_type, constructor_name, module_bytes,
                             weight_tensor)
    self.bigdl_type = bigdl_type
def setNthread(self, value: int):
    """
    Set nthread on the wrapped regressor.

    :param value: value forwarded to "setXGBRegressorNthread"
    """
    wrapped = self.value
    callZooFunc("float", "setXGBRegressorNthread", wrapped, value)
def load_checkpoint(self, path):
    """
    Load a checkpoint from `path` into the wrapped value, then call
    get_weights_to_python() on this object.

    :param path: checkpoint location passed to "loadZooCheckpoint"
    """
    numeric_type, wrapped = self.bigdl_type, self.value
    callZooFunc(numeric_type, "loadZooCheckpoint", wrapped, path)
    # NOTE(review): presumably syncs the freshly loaded weights back to the
    # Python side - confirm against get_weights_to_python.
    self.get_weights_to_python()
def setNumRound(self, value: int):
    """
    Set the number of rounds on the wrapped regressor.

    :param value: value forwarded to "setXGBRegressorNumRound"
    """
    wrapped = self.value
    callZooFunc("float", "setXGBRegressorNumRound", wrapped, value)
def standardScale(df):
    """
    Pass `df` to the "standardScaleDF" function and return its result.

    :param df: input passed to "standardScaleDF"
    :return: whatever "standardScaleDF" returns
    """
    scaled = callZooFunc("float", "standardScaleDF", df)
    return scaled
def setNumWorkers(self, value: int):
    """
    Set the number of workers on the wrapped regressor.

    :param value: value forwarded to "setXGBRegressorNumWorkers"
    """
    wrapped = self.value
    callZooFunc("float", "setXGBRegressorNumWorkers", wrapped, value)