Example #1
    def normalize(self):
        """
        Do normalization on the tokens.
        Tokenization must be done first.
        See Normalizer for more details.

        :return: TextSet after normalization.
        """
        jvalue = callZooFunc(self.bigdl_type, "textSetNormalize", self.value)
        return TextSet(jvalue=jvalue)
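
A minimal usage sketch for context; the import path and file path below are assumptions, not from the original:

    from bigdl.dllib.feature.text import TextSet  # import path assumed
    text_set = TextSet.read(path="/path/to/texts")  # placeholder path
    text_set = text_set.tokenize()  # tokenization must come first
    text_set = text_set.normalize()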
Example #2
    def predict(self, inputs):
        """
        Do prediction on inputs.

        :param inputs: A numpy array, a list of numpy arrays, a JTensor, or a list of JTensors.
        """
        jinputs, input_is_table = Layer.check_input(inputs)
        output = callZooFunc(self.bigdl_type, "inferenceModelPredict",
                             self.value, jinputs, input_is_table)
        return KerasNet.convert_output(output)
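
A hedged usage sketch; judging by the callZooFunc name, the enclosing class is an InferenceModel, and `model` below is assumed to be an instance that has already loaded weights:

    import numpy as np
    # `model` is a placeholder for a loaded InferenceModel instance
    dummy_input = np.random.rand(1, 3, 224, 224).astype("float32")
    preds = model.predict(dummy_input)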
Example #3
    def distributed_predict(self, inputs, sc):
        """
        Do prediction on an RDD of inputs in a distributed fashion.

        :param inputs: An RDD of inputs; each element can be a single input or a list of inputs.
        :param sc: An instance of SparkContext.
        """
        # Inspect the first element to decide whether each input is a table (list) of tensors.
        data_type = inputs.map(lambda x: x.__class__.__name__).first()
        input_is_table = False
        if data_type == "list":
            input_is_table = True
        jinputs = inputs.map(lambda x: Layer.check_input(x)[0])

        output = callZooFunc(self.bigdl_type, "inferenceModelDistriPredict",
                             self.value, sc, jinputs, input_is_table)
        return output.map(lambda x: KerasNet.convert_output(x))
Example #4
    def set_word_index(self, vocab):
        """
        Assign a word_index dictionary for this TextSet to use during word2idx.
        If you are loading the word_index from a saved file, it is recommended to use
        `load_word_index` directly.

        :param vocab: Dictionary of {word: index} to be used for word2idx.
        :return: TextSet with the word_index set.
        """
        jvalue = callZooFunc(self.bigdl_type, "textSetSetWordIndex", self.value, vocab)
        return TextSet(jvalue=jvalue)
Example #5
    def generate_sample(self):
        """
        Generate BigDL Samples.
        word2idx must be done first.
        See TextFeatureToSample for more details.

        :return: TextSet with Samples.
        """
        jvalue = callZooFunc(self.bigdl_type, "textSetGenerateSample", self.value)
        return TextSet(jvalue=jvalue)
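
For context, a sketch of the preprocessing chain that typically precedes generate_sample, using only methods shown elsewhere in this section (the chaining and the length 200 are illustrative assumptions):

    text_set = text_set.tokenize().normalize().word2idx().shape_sequence(len=200)
    text_set = text_set.generate_sample()  # features are now BigDL Samples, ready for training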
Example #6
    def load_bigdl(model_path, weight_path=None, bigdl_type="float"):
        """
        Load a pre-trained BigDL model.

        :param model_path: The path to the pre-trained model.
        :param weight_path: The path to the weights of the pre-trained model. Default is None.
        :return: A pre-trained model.
        """
        jmodel = callZooFunc(bigdl_type, "netLoadBigDL", model_path,
                             weight_path)
        return GraphNet.from_jvalue(jmodel)
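
Hypothetical usage; the paths are placeholders, and it is assumed this staticmethod is exposed on a Net-like class:

    net = Net.load_bigdl("/path/to/model.bigdl", weight_path="/path/to/model.bin")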
Example #7
 def get_predict(self, key="predict"):
     """
     Get the prediction list from a local ImageSet.
     """
     predicts = callZooFunc(self.bigdl_type, "localImageSetToPredict",
                            self.value, key)
     return list(
         map(
             lambda predict:
             (predict[0], list(map(lambda x: x.to_ndarray(), predict[1])))
             if predict[1] else (predict[0], None), predicts))
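
A sketch of consuming the result; `predicted` is assumed to be a local ImageSet on which a model's predict has already been run:

    for uri, arrays in predicted.get_predict():
        if arrays is not None:
            print(uri, [a.shape for a in arrays])  # each element is an ndarray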
Example #8
 def tf_dataset(cls, func, total_size, bigdl_type="float"):
     """
     :param func: a function return a tensorflow dataset
     :param total_size: total size of this dataset
     :param bigdl_type: numeric type
     :return: A feature set
     """
     func = CloudPickleSerializer.dumps(CloudPickleSerializer, func)
     jvalue = callZooFunc(bigdl_type, "createFeatureSetFromTfDataset", func,
                          total_size)
     return cls(jvalue=jvalue)
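
A hedged sketch with a synthetic dataset; the FeatureSet import path is an assumption:

    import numpy as np
    import tensorflow as tf
    from bigdl.dllib.feature.common import FeatureSet  # import path assumed

    def make_dataset():
        # small synthetic dataset for illustration
        x = np.random.rand(100, 4).astype("float32")
        y = np.random.randint(0, 2, size=(100,)).astype("float32")
        return tf.data.Dataset.from_tensor_slices((x, y))

    feature_set = FeatureSet.tf_dataset(make_dataset, total_size=100)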
Example #9
    def to_local(self):
        """
        Convert to a LocalTextSet.

        :return: LocalTextSet
        """
        if self.is_local():
            jvalue = self.value
        else:
            jvalue = callZooFunc(self.bigdl_type, "textSetToLocal", self.value)
        return LocalTextSet(jvalue=jvalue)
Example #10
 def __call__(self, x):
     """
     Some other modules point to current module
     :param x: input variables. x is either a Variable or list of Variable.
     :return: Variable containing current module
     """
     from bigdl.dllib.keras.autograd import Variable
     return Variable.from_jvalue(callZooFunc(self.bigdl_type,
                                             "connectInputs",
                                             self,
                                             to_list(x)))
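
A minimal functional-API sketch; the Dense import path and the Variable constructor signature are assumptions:

    from bigdl.dllib.keras.autograd import Variable
    from bigdl.dllib.keras.layers import Dense  # import path assumed
    x = Variable(input_shape=(10,))  # constructor signature assumed
    y = Dense(2)(x)  # __call__ connects the Dense layer to the input Variable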
Example #11
 def train_imagefeature(self, train_set, criterion, end_trigger=None, checkpoint_trigger=None,
                        validation_set=None, validation_method=None, batch_size=32):
     """
     Train model with provided imageFeature trainSet and criterion.
     The training will end when the endTrigger is triggered.
     During the training, if checkPointTrigger is defined and triggered,
     the model will be saved to modelDir. If validationSet and validationMethod
     are defined, the model will be evaluated at each checkpoint.
     :param train_set: training FeatureSet, a FeatureSet[ImageFeature]
     :param criterion: Loss function
     :param end_trigger: When to finish the training
     :param checkpoint_trigger: When to save a checkpoint and evaluate model.
     :param validation_set: Validation FeatureSet, a FeatureSet[Sample[T]]
     :param validation_method: Validation Methods.
     :param batch_size: Batch size
     :return: self
     """
     callZooFunc(self.bigdl_type, "estimatorTrainImageFeature", self.value, train_set,
                 criterion, end_trigger, checkpoint_trigger, validation_set,
                 validation_method, batch_size)
     return self
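
A hedged call sketch; `estimator` and `train_fset` are assumed to be pre-built placeholders, and the trigger/criterion import paths follow BigDL's layout as an assumption:

    from bigdl.dllib.optim.optimizer import MaxEpoch, EveryEpoch  # import path assumed
    from bigdl.dllib.nn.criterion import ClassNLLCriterion  # import path assumed
    estimator.train_imagefeature(train_set=train_fset,
                                 criterion=ClassNLLCriterion(),
                                 end_trigger=MaxEpoch(10),
                                 checkpoint_trigger=EveryEpoch(),
                                 batch_size=64)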
Example #12
    def read_parquet(path, sc, bigdl_type="float"):
        """
        Read relations from a parquet file.
        The schema should be the following:
        "id1"(string), "id2"(string) and "label"(int).

        :param path: The path to the parquet file.
        :param sc: An instance of SparkContext.
        :return: RDD of Relation.
        """
        jvalue = callZooFunc(bigdl_type, "readRelationsParquet", path, sc)
        return jvalue.map(lambda x: Relation(str(x[0]), str(x[1]), int(x[2])))
Example #13
    def load_tensorflow(self,
                        model_path,
                        model_type="frozenModel",
                        intra_op_parallelism_threads=1,
                        inter_op_parallelism_threads=1,
                        use_per_session_threads=True):
        """
        Load a pre-trained TensorFlow model.

        :param model_path: String. The file path to the TensorFlow model.
        :param model_type: String. The type of the TensorFlow model file. Default is "frozenModel".
        :param intra_op_parallelism_threads: Int. The number of intraOpParallelismThreads.
                                             Default is 1.
        :param inter_op_parallelism_threads: Int. The number of interOpParallelismThreads.
                                             Default is 1.
        :param use_per_session_threads: Boolean. Whether to use perSessionThreads. Default is True.
        """
        callZooFunc(self.bigdl_type, "inferenceModelLoadTensorFlow",
                    self.value, model_path, model_type,
                    intra_op_parallelism_threads, inter_op_parallelism_threads,
                    use_per_session_threads)
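
A hedged sketch of loading a frozen graph; the InferenceModel import path and the model path are assumptions:

    from bigdl.orca.inference import InferenceModel  # import path varies by version; assumed
    model = InferenceModel()
    model.load_tensorflow("/path/to/frozen_inference_graph.pb",  # placeholder path
                          model_type="frozenModel",
                          intra_op_parallelism_threads=4,
                          inter_op_parallelism_threads=4)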
Example #14
    def generate_word_index_map(self, remove_topN=0, max_words_num=-1,
                                min_freq=1, existing_map=None):
        """
        Generate word_index map based on sorted word frequencies in descending order.
        Return the result dictionary, which can also be retrieved by 'get_word_index()'.
        Make sure you call this after tokenize. Otherwise you will get an error.
        See word2idx for more details.

        :return: Dictionary {word: id}
        """
        return callZooFunc(self.bigdl_type, "textSetGenerateWordIndexMap", self.value,
                           remove_topN, max_words_num, min_freq, existing_map)
Example #15
 def forward(self, input):
     """
     NB: It's for debug only, please use optimizer.optimize() in production.
     Takes an input object and computes the corresponding output of the module.

     :param input: ndarray or list of ndarray or JTensor or list of JTensor.
     :return: ndarray or list of ndarray
     """
     jinput, input_is_table = self.check_input(input)
     output = callZooFunc(self.bigdl_type, "zooForward", self.value, jinput,
                          input_is_table)
     return self.convert_output(output)
Example #16
def sum(x, axis=0, keepDims=False):
    """
    Sum of the values in a variable, along the specified axis.
    :param x: A variable.
    :param axis: An integer. The axis to compute the sum over.
    :param keepDims: A boolean, whether to keep the dimensions or not.
            If `keepDims` is `False`, the rank of the variable is reduced
            by 1 for each entry in `axis`. If `keepDims` is `True`,
            the reduced dimensions are retained with length 1.
    :return: A variable with sum of `x`.
    """
    return Variable.from_jvalue(callZooFunc("float", "sum", x, axis, keepDims))
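
A small sketch of calling sum on a Variable; the Variable constructor signature is assumed, and `sum` above is assumed to live in the autograd module:

    import bigdl.dllib.keras.autograd as A  # module location assumed
    from bigdl.dllib.keras.autograd import Variable
    v = Variable(input_shape=(3, 4))
    s = A.sum(v, axis=1, keepDims=True)  # the reduced dimension is kept with length 1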
Example #17
    def shape_sequence(self, len, trunc_mode="pre", pad_element=0):
        """
        Shape the sequence of indices to a fixed length.
        word2idx must be done first.
        See SequenceShaper for more details.

        :param len: Positive int. The target length to which each sequence will be shaped.
        :param trunc_mode: Truncation mode ("pre" or "post") if a sequence is longer than len.
                           Default is "pre".
        :param pad_element: Int. The element used to pad a sequence shorter than len. Default is 0.
        :return: TextSet after sequence shaping.
        """
        assert isinstance(pad_element, int), "pad_element should be an int"
        jvalue = callZooFunc(self.bigdl_type, "textSetShapeSequence", self.value,
                             len, trunc_mode, pad_element)
        return TextSet(jvalue=jvalue)
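
A usage sketch, assuming `text_set` has already been tokenized and run through word2idx:

    text_set = text_set.shape_sequence(len=100, trunc_mode="pre", pad_element=0)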
Example #18
    def load(model_path, weight_path=None, bigdl_type="float"):
        """
        Load an existing Analytics Zoo model defined in Keras-style (with weights).

        :param model_path: The path to load the saved model.
                          Local file system, HDFS and Amazon S3 are supported.
                          HDFS path should be like 'hdfs://[host]:[port]/xxx'.
                          Amazon S3 path should be like 's3a://bucket/xxx'.
        :param weight_path: The path for pre-trained weights if any. Default is None.
        :return: An Analytics Zoo model.
        """
        jmodel = callZooFunc(bigdl_type, "netLoad", model_path, weight_path)
        return Net.from_jvalue(jmodel, bigdl_type)
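
Hypothetical usage with an HDFS path, mirroring the docstring's path convention; host, port and paths are placeholders:

    model = Net.load("hdfs://localhost:9000/models/my_model",
                     weight_path="hdfs://localhost:9000/models/my_model.bin")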
Example #19
    def read_parquet(cls, path, sc, bigdl_type="float"):
        """
        Read texts with ids from a parquet file.
        The schema should be the following:
        "id"(string) and "text"(string).

        :param path: The path to the parquet file.
        :param sc: An instance of SparkContext.

        :return: DistributedTextSet.
        """
        jvalue = callZooFunc(bigdl_type, "textSetReadParquet", path, sc)
        return DistributedTextSet(jvalue=jvalue)
Example #20
 def __call__(self, x=None):
     """
      Connect the current module to upstream module nodes.

      :param x: upstream module nodes. x is either a Node or a list of Nodes.
      :return: Node containing the current module
     """
     x = to_list(x if x else [])
     layer = self
     if isinstance(self, Lambda):
         input_shapes = [var.get_output_shape() for var in x]
         layer = self.create(remove_batch(input_shapes))
     return Variable.from_jvalue(
         callZooFunc(self.bigdl_type, "connectInputs", layer, to_list(x)))
Example #21
    def pytorch_dataloader(cls,
                           dataloader,
                           features="_data[0]",
                           labels="_data[1]",
                           bigdl_type="float"):
        """
        Create a FeatureSet from a pytorch dataloader.

        :param dataloader: a pytorch dataloader, or a function that returns a pytorch dataloader.
        :param features: features in _data, where _data is fetched from the dataloader.
        :param labels: labels in _data, where _data is fetched from the dataloader.
        :param bigdl_type: numeric type
        :return: A feature set
        """
        import torch
        if isinstance(dataloader, torch.utils.data.DataLoader):
            node_num, core_num = get_node_and_core_number()
            if dataloader.batch_size % node_num != 0:
                true_bs = math.ceil(
                    dataloader.batch_size / node_num) * node_num
                warning_msg = "Detect dataloader's batch_size is not divisible by node number(" + \
                              str(node_num) + "), will adjust batch_size to " + str(true_bs) + \
                              " automatically"
                warnings.warn(warning_msg)

            bys = CloudPickleSerializer.dumps(CloudPickleSerializer,
                                              dataloader)
            jvalue = callZooFunc(bigdl_type, "createFeatureSetFromPyTorch",
                                 bys, False, features, labels)
            return cls(jvalue=jvalue)
        elif callable(dataloader):
            bys = CloudPickleSerializer.dumps(CloudPickleSerializer,
                                              dataloader)
            jvalue = callZooFunc(bigdl_type, "createFeatureSetFromPyTorch",
                                 bys, True, features, labels)
            return cls(jvalue=jvalue)
        else:
            raise ValueError(
                "Unsupported dataloader type, please pass pytorch dataloader" +
                " or a function to create pytorch dataloader.")
Example #22
    def from_saved_model(model_path, tag=None, signature=None,
                         inputs=None, outputs=None, tf_session_config=None, init_op=None):
        """
        Create a TFNet from a TensorFlow SavedModel.

        :param model_path: the path to the SavedModel directory
        :param tag: the tag to load in the saved model, default to "serve"
        :param signature: The signature of the SignatureDef that defines inputs
                          and outputs of the graph. TFNet assumes inputs is sorted
                          by their corresponding key in SignatureDef.
        :param inputs: a list of input tensor names of this model. You may want to use TensorFlow's
                      command line tool to inspect the saved model to find the input tensor
                      names, e.g. `saved_model_cli show --dir {saved_model_path} --all`
        :param outputs: a list of output tensor names of this model. You may want to use TensorFlow's
                      command line tool to inspect the saved model to find the output tensor
                      names, e.g. `saved_model_cli show --dir {saved_model_path} --all`
        :param tf_session_config: an optional tf.ConfigProto object to
                       set the session config on the Java side.
                       This config does not have to be the same as that of your current session.
                       E.g. sess_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                                                         intra_op_parallelism_threads=1)
                            net = TFNet.from_session(sess, inputs, outputs, sess_config)
        :return: a TFNet
        """
        config_bytes = None
        if tf_session_config is not None:
            import tensorflow as tf
            assert isinstance(tf_session_config, tf.ConfigProto)
            tf_session_config.use_per_session_threads = True
            config_bytes = bytearray(tf_session_config.SerializeToString())

        if inputs is None or outputs is None:
            jvalue = callZooFunc("float", "createTFNetFromSavedModel",
                                 model_path, tag, signature, config_bytes)
        else:

            jvalue = callZooFunc("float", "createTFNetFromSavedModel",
                                 model_path, tag, inputs, outputs, config_bytes, init_op)
        return TFNet(path=None, jvalue=jvalue)
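
A hedged usage sketch; the model directory and tensor names are placeholders you would obtain from saved_model_cli:

    net = TFNet.from_saved_model("/path/to/saved_model",  # placeholder directory
                                 inputs=["input:0"],      # placeholder tensor names
                                 outputs=["output:0"])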
Example #23
    def predict(self, x, batch_per_thread=4, distributed=True):
        """
        Use a model to do prediction.

        # Arguments
        x: Prediction data. A Numpy array or RDD of Sample or ImageSet.
        batch_per_thread:
          The default value is 4.
          When distributed is True, the total batch size is batch_per_thread * rdd.getNumPartitions.
          When distributed is False, the total batch size is batch_per_thread * numOfCores.
        distributed: Boolean. Whether to do prediction in distributed mode or local mode.
                     Default is True. In local mode, x must be a Numpy array.
        """
        if isinstance(x, ImageSet) or isinstance(x, TextSet):
            results = callZooFunc(self.bigdl_type, "zooPredict", self.value, x,
                                  batch_per_thread)
            return ImageSet(results) if isinstance(
                x, ImageSet) else TextSet(results)
        if distributed:
            if isinstance(x, np.ndarray):
                data_rdd = to_sample_rdd(x, np.zeros([x.shape[0]]))
            elif isinstance(x, RDD):
                data_rdd = x
            else:
                raise TypeError("Unsupported prediction data type: %s" %
                                type(x))
            results = callZooFunc(self.bigdl_type, "zooPredict", self.value,
                                  data_rdd, batch_per_thread)
            return results.map(lambda result: Layer.convert_output(result))
        else:
            if isinstance(x, np.ndarray) or isinstance(x, list):
                results = callZooFunc(self.bigdl_type,
                                      "zooPredict", self.value,
                                      self._to_jtensors(x), batch_per_thread)
                return [Layer.convert_output(result) for result in results]
            else:
                raise TypeError("Unsupported prediction data type: %s" %
                                type(x))
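
A sketch of both modes; `model` is assumed to be an already-built Keras-style model whose input shape matches the dummy data, and distributed mode requires a running SparkContext:

    import numpy as np
    x = np.random.rand(8, 10).astype("float32")
    local_preds = model.predict(x, distributed=False)  # list of ndarrays
    rdd_preds = model.predict(x, distributed=True)     # RDD of predictions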
Example #24
    def get_predicts(self):
        """
        Get the prediction results (if any) combined with uris (if any) of a TextSet.
        If a text doesn't have a uri, its corresponding uri will be None.
        If a text hasn't been predicted by a model, its corresponding prediction will be None.

        :return: List of (uri, prediction as a list of numpy array) for LocalTextSet.
                 RDD of (uri, prediction as a list of numpy array) for DistributedTextSet.
        """
        predicts = callZooFunc(self.bigdl_type, "textSetGetPredicts", self.value)
        if isinstance(predicts, RDD):
            return predicts.map(lambda predict: (predict[0], _process_predict_result(predict[1])))
        else:
            return [(predict[0], _process_predict_result(predict[1])) for predict in predicts]
Example #25
    def get_train_summary(self, tag=None):
        """
        Get the scalar from the model's train summary.
        Return a 2-D array-like object which can be converted
        by np.array().

        # Arguments
        tag: The string variable representing the scalar wanted
        """
        # exception handle
        if tag != "Loss" and tag != "LearningRate" and tag != "Throughput":
            raise TypeError('Only "Loss", "LearningRate", "Throughput" ' +
                            'are supported in train summary')

        return callZooFunc(self.bigdl_type, "zooGetScalarFromSummary",
                           self.value, tag, "Train")
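
Hypothetical usage; it assumes a train summary was attached to the model before training (e.g. via a set_train_summary-style call, an assumption), and that each row is laid out as (step, value, timestamp):

    import numpy as np
    loss = np.array(model.get_train_summary("Loss"))
    print(loss[:5])  # first few (step, value, timestamp) rows; layout assumed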
Example #26
    def from_rdds(cls, image_rdd, label_rdd=None, bigdl_type="float"):
        """
        Create an ImageSet from RDDs of ndarrays.

        :param image_rdd: an RDD of ndarrays; each ndarray should have 3 or 4 dimensions (3D images)
        :param label_rdd: an RDD of ndarrays
        :return: a DistributedImageSet
        """
        image_rdd = image_rdd.map(lambda x: JTensor.from_ndarray(x))
        if label_rdd is not None:
            label_rdd = label_rdd.map(lambda x: JTensor.from_ndarray(x))
        return ImageSet(jvalue=callZooFunc(bigdl_type,
                                           "createDistributedImageSet",
                                           image_rdd, label_rdd),
                        bigdl_type=bigdl_type)
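
A sketch assuming a running SparkContext `sc` and CHW float images; shapes and label format are illustrative:

    import numpy as np
    image_rdd = sc.parallelize([np.random.rand(3, 224, 224) for _ in range(4)])
    label_rdd = sc.parallelize([np.array([1.0]) for _ in range(4)])
    image_set = ImageSet.from_rdds(image_rdd, label_rdd)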
Example #27
    def word2idx(self, remove_topN=0, max_words_num=-1, min_freq=1, existing_map=None):
        """
        Map word tokens to indices.
        Important: Take care that this method behaves a bit differently for training and inference.

        ---------------------------------------Training--------------------------------------------
        During training, you need to generate a new word_index dictionary according to the texts
        you are dealing with. Thus this method will first do the dictionary generation and then
        convert words to indices based on the generated dictionary.

        You can specify the following arguments, which pose some constraints when generating
        the dictionary.
        In the resulting dictionary, indices start from 1 and follow the occurrence
        frequency of each word, sorted in descending order.
        Here we adopt the convention that index 0 will be reserved for unknown words.
        After word2idx, you can get the generated word_index dictionary by calling 'get_word_index'.
        Also, you can call `save_word_index` to save this word_index dictionary to be used in
        future training.

        :param remove_topN: Non-negative int. Remove the topN words with highest frequencies
                            in the case where those are treated as stopwords.
                            Default is 0, namely remove nothing.
        :param max_words_num: Int. The maximum number of words to be taken into consideration.
                              Default is -1, namely all words will be considered.
                              Otherwise, it should be a positive int.
        :param min_freq: Positive int. Only those words with frequency >= min_freq will be taken
                         into consideration.
                         Default is 1, namely all words that occur will be considered.
        :param existing_map: Existing dictionary of word_index if any.
                             Default is None and in this case a new dictionary with index starting
                             from 1 will be generated.
                             If not None, then the generated dictionary will preserve the word_index
                             in existing_map and assign subsequent indices to new words.

        ---------------------------------------Inference--------------------------------------------
        During inference, you are supposed to use exactly the same word_index dictionary as in
        the training stage instead of generating a new one.
        Thus you do not need to specify any of the above arguments.
        You need to call `load_word_index` or `set_word_index` beforehand for dictionary loading.

        Tokenization must be done first.
        See WordIndexer for more details.

        :return: TextSet after word2idx.
        """
        jvalue = callZooFunc(self.bigdl_type, "textSetWord2idx", self.value,
                             remove_topN, max_words_num, min_freq, existing_map)
        return TextSet(jvalue=jvalue)
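
A hedged end-to-end sketch of the two stages; the save_word_index/load_word_index usage and chaining are inferred from the docstring above, and the index file path is a placeholder:

    # training: generate a fresh dictionary under the given constraints, then persist it
    train_set = train_set.tokenize().word2idx(remove_topN=1, max_words_num=5000)
    train_set.save_word_index("/tmp/word_index.txt")

    # inference: reuse the saved dictionary instead of generating a new one
    test_set = test_set.tokenize().load_word_index("/tmp/word_index.txt").word2idx()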
Example #28
    def to_distributed(self, sc=None, partition_num=4):
        """
        Convert to a DistributedTextSet.

        Need to specify SparkContext to convert a LocalTextSet to a DistributedTextSet.
        In this case, you may also want to specify partition_num, the default of which is 4.

        :return: DistributedTextSet
        """
        if self.is_distributed():
            jvalue = self.value
        else:
            assert sc, "sc cannot be null to transform a LocalTextSet to a DistributedTextSet"
            jvalue = callZooFunc(self.bigdl_type, "textSetToDistributed", self.value,
                                 sc, partition_num)
        return DistributedTextSet(jvalue=jvalue)
Example #29
    def backward(self, y_true, y_pred):
        """
        NB: It's for debug only, please use optimizer.optimize() in production.
        Performs a back-propagation step through the criterion, with respect to the given input.

        :param y_true: ndarray or list of ndarray
        :param y_pred: ndarray or list of ndarray
        :return: ndarray
        """
        input = y_pred
        target = y_true
        jinput, input_is_table = Layer.check_input(input)
        jtarget, target_is_table = Layer.check_input(target)
        output = callZooFunc(self.bigdl_type, "criterionBackward", self.value,
                             jinput, input_is_table, jtarget, target_is_table)
        return Layer.convert_output(output)
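
A debug-only sketch, consistent with the docstring's warning; `criterion` is a placeholder for a loss instance whose expected shapes match the dummy data:

    import numpy as np
    y_pred = np.random.rand(4, 3).astype("float32")
    y_true = np.random.rand(4, 3).astype("float32")
    grad = criterion.backward(y_true, y_pred)  # gradient of the loss w.r.t. y_pred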
Example #30
    def get_word_index(embedding_file, bigdl_type="float"):
        """
        Get the full wordIndex map from the given embedding_file.

        # Arguments
        embedding_file: The path to the embedding file.
                        Currently only the following GloVe files are supported:
                        "glove.6B.50d.txt", "glove.6B.100d.txt", "glove.6B.200d.txt"
                        "glove.6B.300d.txt", "glove.42B.300d.txt", "glove.840B.300d.txt".
                        You can download them from: https://nlp.stanford.edu/projects/glove/.

        # Return
        Dictionary of word (string) and its corresponding index (int) obtained from
        the given embedding file.
        """
        return callZooFunc(bigdl_type, "wordEmbeddingGetWordIndex",
                           embedding_file)
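
Hypothetical usage; the GloVe file path is a placeholder and the enclosing class is assumed to be WordEmbedding:

    word_index = WordEmbedding.get_word_index("/path/to/glove.6B.100d.txt")
    print(len(word_index))        # vocabulary size
    print(word_index.get("the"))  # index of a common word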