Example #1
    def read_rdd(esConfig, esResource=None, filter=None, esQuery=None):
        """
        Read data from Elasticsearch into a Spark RDD.

        :param esConfig: Dictionary of Elasticsearch configuration
               (e.g. ip, port, es query, etc.).
        :param esResource: Optional. Resource file in Elasticsearch.
               It can also be set in esConfig.
        :param filter: Optional. Request only the specified fields from Elasticsearch.
        :param esQuery: Optional. Elasticsearch query.
        :return: Spark RDD
        """
        sc = init_nncontext()
        if "es.resource" not in esConfig:
            esConfig["es.resource"] = esResource
        if filter is not None:
            esConfig["es.read.source.filter"] = filter
        if esQuery is not None:
            esConfig["es.query"] = esQuery
        rdd = sc.newAPIHadoopRDD(
            "org.elasticsearch.hadoop.mr.EsInputFormat",
            "org.apache.hadoop.io.NullWritable",
            "org.elasticsearch.hadoop.mr.LinkedMapWritable",
            conf=esConfig)
        return rdd
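
A minimal usage sketch (not part of the original snippet): "es.nodes" and "es.port" are standard elasticsearch-hadoop settings, while the host, port, and index name below are placeholders; read_rdd is called directly here for illustration, although it is likely exposed as a method of a reader class.

# Hedged usage sketch for read_rdd: host, port and index name are placeholders.
es_conf = {
    "es.nodes": "localhost",   # placeholder Elasticsearch host
    "es.port": "9200",         # placeholder port
}
# Reads every document of the index into an RDD of
# (NullWritable, LinkedMapWritable) key-value pairs.
rdd = read_rdd(es_conf, esResource="my_index")
print(rdd.take(1))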
Example #2
    def partition(data, num_shards=None):
        """
        Partition local in-memory data and form a SparkXShards.

        :param data: np.ndarray, a tuple, list, dict of np.ndarray, or a nested structure
               made of tuple, list, dict with np.ndarray as the leaf value
        :param num_shards: the number of shards that the data will be partitioned into
        :return: a SparkXShards
        """
        sc = init_nncontext()
        node_num, core_num = get_node_and_core_number()
        shard_num = node_num * core_num if num_shards is None else num_shards
        import numpy as np
        type_err_msg = """
The types supported in zoo.orca.data.XShards.partition are
1. np.ndarray
2. a tuple, list, dict of np.ndarray
3. nested structure made of tuple, list, dict with ndarray as the leaf value

But got data of type {}
        """.format(type(data))
        supported_types = {list, tuple, dict}
        if isinstance(data, np.ndarray):
            if data.shape[0] < shard_num:
                raise ValueError(
                    "The length of data {} is smaller than the total number "
                    "of shards {}. Please adjust the num_shards option to be "
                    "at most {}.".format(data.shape[0], shard_num,
                                         data.shape[0]))
            arrays = np.array_split(data, shard_num)
            rdd = sc.parallelize(arrays)
        else:
            assert type(data) in supported_types, type_err_msg
            flattened = nest.flatten(data)
            data_length = len(flattened[0])
            data_to_be_shard = []
            if data_length < shard_num:
                raise ValueError(
                    "The length of data {} is smaller than the total number "
                    "of shards {}. Please adjust the num_shards option to be "
                    "at most {}.".format(data_length, shard_num, data_length))
            for i in range(shard_num):
                data_to_be_shard.append([])
            for x in flattened:
                assert len(x) == data_length, \
                    "the ndarrays in data must all have the same size in first dimension, " \
                    "got first ndarray of size {} and another {}".format(data_length, len(x))
                x_parts = np.array_split(x, shard_num)
                for idx, x_part in enumerate(x_parts):
                    data_to_be_shard[idx].append(x_part)

            data_to_be_shard = [
                nest.pack_sequence_as(data, shard)
                for shard in data_to_be_shard
            ]
            rdd = sc.parallelize(data_to_be_shard)

        data_shards = SparkXShards(rdd)
        return data_shards
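
A hedged usage sketch: the error message above refers to zoo.orca.data.XShards.partition, so the import path and class name below are taken from that; the arrays and shard count are illustrative only.

# Hedged usage sketch for partition(); arrays and shard count are illustrative.
import numpy as np
from zoo.orca.data import XShards

features = np.random.randn(100, 3)
labels = np.random.randint(0, 2, size=(100,))
# Partition a dict of ndarrays into 4 shards; each shard keeps the same
# {"x": ..., "y": ...} structure and holds one slice of each array.
data_shards = XShards.partition({"x": features, "y": labels}, num_shards=4)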
Example #3
    def load_pickle(cls, path, minPartitions=None):
        """
        Load XShards from pickle files.

        :param path: The pickle file path/directory
        :param minPartitions: The minimum partitions for the XShards
        :return: SparkXShards object
        """
        sc = init_nncontext()
        return SparkXShards(sc.pickleFile(path, minPartitions))
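
A hedged call example: the directory is a placeholder, and SparkXShards is assumed to be the class that defines this classmethod.

# Hedged usage: the path is a placeholder for a directory of pickle files.
shards = SparkXShards.load_pickle("/tmp/xshards_pickle", minPartitions=4)
print(shards.collect()[:1])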
Example #4
    def partition(data):
        """
        Partition local in-memory data and form a SparkXShards.

        :param data: np.ndarray, a tuple, list, dict of np.ndarray, or a nested structure
               made of tuple, list, dict with np.ndarray as the leaf value
        :return: a SparkXShards
        """
        sc = init_nncontext()
        node_num, core_num = get_node_and_core_number()
        total_core_num = node_num * core_num
        import numpy as np
        type_err_msg = """
The types supported in zoo.orca.data.XShards.partition are
1. np.ndarray
2. a tuple, list, dict of np.ndarray
3. nested structure made of tuple, list, dict with ndarray as the leaf value

But got data of type {}
        """.format(type(data))
        supported_types = {list, tuple, dict}
        if isinstance(data, np.ndarray):
            arrays = np.array_split(data, total_core_num)
            rdd = sc.parallelize(arrays)
        else:
            assert type(data) in supported_types, type_err_msg
            flattened = nest.flatten(data)
            data_length = len(flattened[0])
            data_to_be_shard = []
            for i in range(total_core_num):
                data_to_be_shard.append([])
            for x in flattened:
                assert len(x) == data_length, \
                    "the ndarrays in data must all have the same size in first dimension, " \
                    "got first ndarray of size {} and another {}".format(data_length, len(x))
                x_parts = np.array_split(x, total_core_num)
                for idx, x_part in enumerate(x_parts):
                    data_to_be_shard[idx].append(x_part)

            data_to_be_shard = [
                nest.pack_sequence_as(data, shard)
                for shard in data_to_be_shard
            ]
            rdd = sc.parallelize(data_to_be_shard)

        data_shards = SparkXShards(rdd)
        return data_shards
Example #5
def predict(model_path, img_path):
    model = InferenceModel()
    model.load_openvino(model_path,
                        weight_path=model_path[:model_path.rindex(".")] +
                        ".bin",
                        batch_size=BATCH_SIZE)
    sc = init_nncontext("OpenVINO Python resnet_v1_50 Inference Example")
    # pre-processing
    infer_transformer = ChainedPreprocessing([
        ImageBytesToMat(),
        ImageResize(256, 256),
        ImageCenterCrop(224, 224),
        ImageMatToTensor(format="NHWC", to_RGB=True)
    ])
    image_set = ImageSet.read(img_path, sc).\
        transform(infer_transformer).get_image().collect()
    image_set = np.expand_dims(image_set, axis=1)

    for i in range(len(image_set) // BATCH_SIZE + 1):
        index = i * BATCH_SIZE
        # check whether out of index
        if index >= len(image_set):
            break
        batch = image_set[index]
        # put up to BATCH_SIZE images in one batch
        for j in range(index + 1, min(index + BATCH_SIZE, len(image_set))):
            batch = np.vstack((batch, image_set[j]))
        batch = np.expand_dims(batch, axis=0)
        # predict batch
        predictions = model.predict(batch)
        result = predictions[0]

        # post-processing for Top-1
        print("batch_" + str(i))
        for r in result:
            output = {}
            max_index = np.argmax(r)
            output["Top-1"] = str(max_index)
            print("* Predict result " + str(output))
    print("finished...")
    sc.stop()
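
A hedged invocation sketch (not in the original snippet): the paths are placeholders, and BATCH_SIZE is assumed to be defined at module level; the .bin weight file is derived from the .xml model path inside predict().

# Hedged invocation sketch; the paths are placeholders.
BATCH_SIZE = 4
predict("/path/to/resnet_v1_50.xml", "/path/to/images")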
Example #6
    def read_df(esConfig, esResource, schema=None):
        """
        Read data from Elasticsearch into a Spark DataFrame.

        :param esConfig: Dictionary of Elasticsearch configuration
               (e.g. ip, port, etc.).
        :param esResource: Resource file in Elasticsearch.
        :param schema: Optional. Defines the schema of the Spark DataFrame.
               If each column in ES holds a single value, the schema does not need to be set.
        :return: Spark DataFrame. Each row represents a document in ES.
        """
        sc = init_nncontext()
        sqlContext = SQLContext.getOrCreate(sc)
        spark = sqlContext.sparkSession

        reader = spark.read.format("org.elasticsearch.spark.sql")

        for key in esConfig:
            reader.option(key, esConfig[key])
        if schema:
            reader.schema(schema)

        df = reader.load(esResource)
        return df
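
A hedged usage sketch for read_df: the host, index name, and schema fields are placeholders.

# Hedged usage sketch: host, index and schema fields are placeholders.
from pyspark.sql.types import StructType, StructField, StringType

es_conf = {
    "es.nodes": "localhost",   # placeholder Elasticsearch host
    "es.port": "9200",         # placeholder port
}
schema = StructType([
    StructField("title", StringType(), True),
    StructField("body", StringType(), True),
])
df = read_df(es_conf, "my_index", schema)
df.show(5)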
Example #7
def predict(model_path, image_path, top_n):
    sc = init_nncontext(
        "Image classification inference example using int8 quantized model")
    images = ImageSet.read(image_path, sc, image_codec=1)
    model = ImageClassifier.load_model(model_path)
    output = model.predict_image_set(images)
    label_map = model.get_config().label_map()

    # list of images composing uri and results in tuple format
    predicts = output.get_predict().collect()

    sequential = Sequential()
    sequential.add(Activation("softmax", input_shape=predicts[0][1][0].shape))
    for pre in predicts:
        (uri, probs) = pre
        out = sequential.forward(probs[0])
        sortedProbs = [(prob, index) for index, prob in enumerate(out)]
        sortedProbs.sort()
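        # sortedProbs is sorted in ascending order, so index 999 - i picks the
        # i-th highest probability (this assumes the model outputs 1000 classes).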
        print("Image : %s, top %d prediction result" % (uri, top_n))
        for i in range(top_n):
            print(
                "\t%s, %f" %
                (label_map[sortedProbs[999 - i][1]], sortedProbs[999 - i][0]))
Example #8
# limitations under the License.
#

from bigdl.optim.optimizer import Adam
from keras.datasets import imdb
from keras.preprocessing import sequence
from zoo.pipeline.api.keras.models import Model
from zoo.pipeline.api.keras.layers import *
from zoo.pipeline.api.autograd import *
from zoo.common.nncontext import init_spark_conf
from zoo.common.nncontext import init_nncontext

conf = init_spark_conf()
conf.set("spark.executor.extraJavaOptions", "-Xss512m")
conf.set("spark.driver.extraJavaOptions", "-Xss512m")
sc = init_nncontext(conf)
max_features = 20000
max_len = 200

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_len)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

xmb = np.zeros((len(x_train), max_len, 2), dtype=np.int32)
Example #9
from zoo.pipeline.api.keras.layers import *
from zoo.models.recommendation import UserItemFeature
from zoo.models.recommendation import NeuralCF
from zoo.common.nncontext import init_nncontext
import matplotlib
from sklearn import metrics
from operator import itemgetter
from bigdl.dataset import movielens
from bigdl.util.common import *

sc = init_nncontext("NCF Example")
movielens_data = movielens.get_id_ratings("/tmp/movielens/")
min_user_id = np.min(movielens_data[:, 0])
max_user_id = np.max(movielens_data[:, 0])
min_movie_id = np.min(movielens_data[:, 1])
max_movie_id = np.max(movielens_data[:, 1])
rating_labels = np.unique(movielens_data[:, 2])

print(movielens_data.shape)
print(min_user_id, max_user_id, min_movie_id, max_movie_id, rating_labels)

def build_sample(user_id, item_id, rating):
    sample = Sample.from_ndarray(np.array([user_id, item_id]), np.array([rating]))
    return UserItemFeature(user_id, item_id, sample)
pairFeatureRdds = sc.parallelize(movielens_data)\
    .map(lambda x: build_sample(x[0], x[1], x[2] - 1))
pairFeatureRdds.take(3)
trainPairFeatureRdds, valPairFeatureRdds = pairFeatureRdds.randomSplit([0.8, 0.2], seed=1)
valPairFeatureRdds.cache()
train_rdd = trainPairFeatureRdds.map(lambda pair_feature: pair_feature.sample)
val_rdd = valPairFeatureRdds.map(lambda pair_feature: pair_feature.sample)
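
A hedged sketch of the next step (not shown in the snippet): building and training a NeuralCF model on train_rdd. The constructor, compile, and fit arguments below are assumptions based on the counts computed above; check the NeuralCF and zoo Keras-style API docs for the exact signatures.

# Hedged sketch: argument names and values are assumptions, not taken from the snippet.
ncf = NeuralCF(user_count=max_user_id, item_count=max_movie_id, class_num=5)
ncf.compile(optimizer="adam",
            loss="sparse_categorical_crossentropy",
            metrics=["accuracy"])
ncf.fit(train_rdd, nb_epoch=10, batch_size=2800, validation_data=val_rdd)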
Example #10
        sortedProbs.sort()
        print("Image : %s, top %d prediction result" % (uri, topN))
        for i in range(topN):
            print("\t%s, %f" %
                  (labelMap[sortedProbs[999 - i][1]], sortedProbs[999 - i][0]))


if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("-f",
                      "--folder",
                      type=str,
                      dest="img_path",
                      default=".",
                      help="Path where the images are stored")
    parser.add_option("--model",
                      type=str,
                      dest="model_path",
                      default="",
                      help="Path where the model is stored")
    parser.add_option("--topN",
                      type=int,
                      dest="topN",
                      default=1,
                      help="top N number")
    (options, args) = parser.parse_args(sys.argv)

    sc = init_nncontext("Image Classification Example")

    predict(options.model_path, options.img_path, options.topN)
Example #11
def predict(img_path):
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(img_path, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])),
        batch_size=8, shuffle=False,
        num_workers=1, pin_memory=True)

    model = models.resnet18(pretrained=True).eval()
    net = TorchNet.from_pytorch(model, [1, 3, 224, 224])

    for inputs, labels in val_loader:
        output = net.predict(inputs.numpy(), distributed=True).collect()
        index = [o.argmax() for o in output]
        print(index)


if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("--image", type=str, dest="img_path",
                      help="The path where the images are stored, "
                           "can be either a folder or an image path")
    (options, args) = parser.parse_args(sys.argv)

    sc = init_nncontext("Torch ResNet Prediction Example")
    predict(options.img_path)
Example #12
    def __init__(self, data):
        sc = init_nncontext()
        self.broadcast_data = sc.broadcast(data)
        self._value = None
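
The fragment above looks like a small broadcast wrapper. A hedged sketch of how a lazy accessor for it might look; the value property below is an assumption and is not part of the snippet.

    # Hedged sketch (not in the original snippet): lazily dereference the Spark
    # broadcast on first access and cache the result in self._value.
    @property
    def value(self):
        if self._value is None:
            self._value = self.broadcast_data.value
        return self._value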
Example #13
    parser.add_option("-l",
                      "--learning_rate",
                      dest="learning_rate",
                      default="0.01")
    parser.add_option("--log_dir", dest="log_dir", default="/tmp/.bigdl")
    parser.add_option("--model", dest="model")

    (options, args) = parser.parse_args(sys.argv)
    data_path = options.data_path
    token_length = int(options.token_length)
    sequence_len = int(options.sequence_length)
    max_words_num = int(options.max_words_num)
    training_split = float(options.training_split)
    batch_size = int(options.batch_size)

    sc = init_nncontext(
        create_spark_conf().setAppName("Text Classification Example"))

    print('Processing text dataset...')
    texts = get_news20(base_dir=data_path)
    text_data_rdd = sc.parallelize(texts, options.partition_num)

    word_meta = analyze_texts(text_data_rdd)
    # Remove the top 10 words roughly. You might want to fine tune this.
    word_meta = dict(word_meta[10:max_words_num])
    word_meta_broadcast = sc.broadcast(word_meta)

    word2vec = get_glove(base_dir=data_path, dim=token_length)
    # Ignore those unknown words.
    filtered_word2vec = dict(
        (w, v) for w, v in word2vec.items() if w in word_meta)
    filtered_word2vec_broadcast = sc.broadcast(filtered_word2vec)
Example #14
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
import cv2

from zoo.common.nncontext import init_nncontext
from zoo.models.image.objectdetection import *

sc = init_nncontext("Object Detection Example")

parser = argparse.ArgumentParser()
parser.add_argument('model_path', help="Path where the model is stored")
parser.add_argument('img_path', help="Path where the images are stored")
parser.add_argument('output_path', help="Path to store the detection results")
parser.add_argument("--partition_num",
                    type=int,
                    default=1,
                    help="The number of partitions")


def predict(model_path, img_path, output_path, partition_num):
    model = ObjectDetector.load_model(model_path)
    image_set = ImageSet.read(img_path,
                              sc,
Example #15
if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("--data_path", dest="data_path")
    parser.add_option("--embedding_file", dest="embedding_file")
    parser.add_option("--question_length", dest="question_length", default="10")
    parser.add_option("--answer_length", dest="answer_length", default="40")
    parser.add_option("--partition_num", dest="partition_num", default="4")
    parser.add_option("-b", "--batch_size", dest="batch_size", default="200")
    parser.add_option("-e", "--nb_epoch", dest="nb_epoch", default="30")
    parser.add_option("-l", "--learning_rate", dest="learning_rate", default="0.001")
    parser.add_option("-m", "--model", dest="model")
    parser.add_option("--output_path", dest="output_path")

    (options, args) = parser.parse_args(sys.argv)
    sc = init_nncontext("QARanker Example")

    q_set = TextSet.read_csv(options.data_path + "/question_corpus.csv",
                             sc, int(options.partition_num)).tokenize().normalize()\
        .word2idx(min_freq=2).shape_sequence(int(options.question_length))
    a_set = TextSet.read_csv(options.data_path+"/answer_corpus.csv",
                             sc, int(options.partition_num)).tokenize().normalize()\
        .word2idx(min_freq=2, existing_map=q_set.get_word_index())\
        .shape_sequence(int(options.answer_length))

    train_relations = Relations.read(options.data_path + "/relation_train.csv",
                                     sc, int(options.partition_num))
    train_set = TextSet.from_relation_pairs(train_relations, q_set, a_set)
    validate_relations = Relations.read(options.data_path + "/relation_valid.csv",
                                        sc, int(options.partition_num))
    validate_set = TextSet.from_relation_lists(validate_relations, q_set, a_set)
Example #16
    def predict(self, data, feature_cols=None, batch_size=4):
        """
        Predict input data.

        :param data: data to be predicted. XShards, Spark DataFrame, numpy array and list of
               numpy arrays are supported. If data is an XShards, each partition is a dictionary
               of {'x': feature}, where feature is a numpy array or a list of numpy arrays.
        :param feature_cols: Feature column name(s) of data. Only used when data is a Spark
               DataFrame. Default: None.
        :param batch_size: Int. The batch size. Default: 4.
        :return: predicted result.
                 If the input data is an XShards, the predict result is an XShards, and each
                 partition of the XShards is a dictionary of {'prediction': result}, where the
                 result is a numpy array or a list of numpy arrays.
                 If the input data is a numpy array or a list of numpy arrays, the predict
                 result is a numpy array or a list of numpy arrays.
        """
        sc = init_nncontext()
        model_bytes_broadcast = sc.broadcast(self.model_bytes)
        weight_bytes_broadcast = sc.broadcast(self.weight_bytes)

        def partition_inference(partition):
            model_bytes = model_bytes_broadcast.value
            weight_bytes = weight_bytes_broadcast.value
            partition = list(partition)
            data_num = len(partition)
            ie = IECore()
            config = {'CPU_THREADS_NUM': str(self.core_num)}
            ie.set_config(config, 'CPU')
            net = ie.read_network(model=model_bytes,
                                  weights=weight_bytes,
                                  init_from_buffer=True)
            net.batch_size = batch_size
            local_model = ie.load_network(network=net,
                                          device_name="CPU",
                                          num_requests=data_num)
            inputs = list(iter(local_model.requests[0].input_blobs))
            outputs = list(iter(local_model.requests[0].output_blobs))
            assert len(outputs) != 0, "The number of model outputs should not be 0."

            def add_elem(d):
                d_len = len(d)
                if d_len < batch_size:
                    rep_time = [1] * (d_len - 1)
                    rep_time.append(batch_size - d_len + 1)
                    return np.repeat(d, rep_time, axis=0), d_len
                else:
                    return d, d_len

            results = []
            for idx, batch_data in enumerate(partition):
                infer_request = local_model.requests[idx]
                input_dict = dict()
                elem_num = 0
                if isinstance(batch_data, list):
                    for i, input in enumerate(inputs):
                        input_dict[input], elem_num = add_elem(batch_data[i])
                else:
                    input_dict[inputs[0]], elem_num = add_elem(batch_data)
                infer_request.infer(input_dict)
                if len(outputs) == 1:
                    results.append(infer_request.output_blobs[
                        outputs[0]].buffer[:elem_num])
                else:
                    results.append(
                        list(
                            map(
                                lambda output: infer_request.output_blobs[
                                    output].buffer[:elem_num], outputs)))

            return results

        def predict_transform(dict_data, batch_size):
            assert isinstance(dict_data, dict), "each shard should be a dict"
            assert "x" in dict_data, "key 'x' should be in each shard"
            feature_data = dict_data["x"]
            if isinstance(feature_data, np.ndarray):
                assert feature_data.shape[0] <= batch_size, \
                    "The batch size of input data (the first dim) should be no larger than " \
                    "the model batch size, otherwise some inputs will be ignored."
            elif isinstance(feature_data, list):
                for elem in feature_data:
                    assert isinstance(elem, np.ndarray), "Each element in the x list should be " \
                                                         "an ndarray, but got " + \
                                                         elem.__class__.__name__
                    assert elem.shape[0] <= batch_size, "The batch size of each input data (the " \
                                                        "first dim) should be no larger than " \
                                                        "the model batch size, otherwise some " \
                                                        "inputs will be ignored."
            else:
                raise ValueError(
                    "x in each shard should be an ndarray or a list of ndarrays."
                )
            return feature_data

        if isinstance(data, DataFrame):
            from zoo.orca.learn.utils import dataframe_to_xshards, convert_predict_rdd_to_dataframe
            xshards, _ = dataframe_to_xshards(data,
                                              validation_data=None,
                                              feature_cols=feature_cols,
                                              label_cols=None,
                                              mode="predict")
            transformed_data = xshards.transform_shard(predict_transform,
                                                       batch_size)
            result_rdd = transformed_data.rdd.mapPartitions(
                lambda iter: partition_inference(iter))
            return convert_predict_rdd_to_dataframe(
                data, result_rdd.flatMap(lambda data: data))
        elif isinstance(data, SparkXShards):
            transformed_data = data.transform_shard(predict_transform,
                                                    batch_size)
            result_rdd = transformed_data.rdd.mapPartitions(
                lambda iter: partition_inference(iter))

            def update_result_shard(data):
                shard, y = data
                shard["prediction"] = y
                return shard

            return SparkXShards(
                data.rdd.zip(result_rdd).map(update_result_shard))
        elif isinstance(data, (np.ndarray, list)):
            if isinstance(data, np.ndarray):
                split_num = math.ceil(len(data) / batch_size)
                arrays = np.array_split(data, split_num)
                num_slices = min(split_num, self.node_num)
                data_rdd = sc.parallelize(arrays, numSlices=num_slices)
            elif isinstance(data, list):
                flattened = nest.flatten(data)
                data_length = len(flattened[0])
                data_to_be_rdd = []
                split_num = math.ceil(flattened[0].shape[0] / batch_size)
                num_slices = min(split_num, self.node_num)
                for i in range(split_num):
                    data_to_be_rdd.append([])
                for x in flattened:
                    assert isinstance(x, np.ndarray), "the data in the data list should be " \
                                                      "ndarrays, but get " + \
                                                      x.__class__.__name__
                    assert len(x) == data_length, \
                        "the ndarrays in data must all have the same size in first dimension" \
                        ", got first ndarray of size {} and another {}".format(data_length, len(x))
                    x_parts = np.array_split(x, split_num)
                    for idx, x_part in enumerate(x_parts):
                        data_to_be_rdd[idx].append(x_part)

                data_to_be_rdd = [
                    nest.pack_sequence_as(data, shard)
                    for shard in data_to_be_rdd
                ]
                data_rdd = sc.parallelize(data_to_be_rdd, numSlices=num_slices)

            print("Partition number: ", data_rdd.getNumPartitions())
            result_rdd = data_rdd.mapPartitions(
                lambda iter: partition_inference(iter))
            result_arr_list = result_rdd.collect()
            result_arr = None
            if isinstance(result_arr_list[0], list):
                result_arr = [
                    np.concatenate([r[i] for r in result_arr_list], axis=0)
                    for i in range(len(result_arr_list[0]))
                ]
            elif isinstance(result_arr_list[0], np.ndarray):
                result_arr = np.concatenate(result_arr_list, axis=0)
            return result_arr
        else:
            raise ValueError(
                "Only XShards, Spark DataFrame, a numpy array and a list of numpy "
                "arrays are supported as input data, but got " +
                data.__class__.__name__)
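
A hedged usage sketch for the predict method above: est stands in for the (unshown) estimator object that defines it, and the input array shape is illustrative only.

# Hedged usage sketch: `est` is a placeholder for the object that defines predict().
import numpy as np

samples = np.random.rand(32, 3, 224, 224).astype(np.float32)
# With batch_size=4 the array is split into ceil(32 / 4) = 8 batches, each batch
# is run through the OpenVINO model, and the per-batch results are concatenated.
predictions = est.predict(samples, batch_size=4)
print(predictions.shape)  # single-output model assumed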
Example #17
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
import cv2

from zoo.common.nncontext import init_nncontext
from zoo.models.image.objectdetection import *

sc = init_nncontext(create_spark_conf().setAppName("Object Detection Example"))

parser = argparse.ArgumentParser()
parser.add_argument('model_path', help="Path where the model is stored")
parser.add_argument('img_path', help="Path where the images are stored")
parser.add_argument('output_path',  help="Path to store the detection results")


def predict(model_path, img_path, output_path):
    model = ObjectDetector.load_model(model_path)
    image_set = ImageSet.read(img_path, sc)
    output = model.predict_image_set(image_set)

    config = model.get_config()
    visualizer = Visualizer(config.label_map(), encoding="jpg")
    visualized = visualizer(output).get_image(to_chw=False).collect()
Example #18
    df['hours'] = df['datetime'].dt.hour
    df['awake'] = (((df['hours'] >= awake_begin) & (df['hours'] <= awake_end))
                   | (df['hours'] == 0)).astype(int)
    return df


if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("-f",
                      type=str,
                      dest="file_path",
                      help="The file path to be read")

    (options, args) = parser.parse_args(sys.argv)

    sc = init_nncontext()

    # read data
    file_path = options.file_path
    data_shard = zoo.orca.data.pandas.read_csv(file_path, sc)
    data = data_shard.collect()

    # repartition
    data_shard = data_shard.repartition(2)

    # apply function on each element
    trans_data_shard = data_shard.transform_shard(process_feature)
    data2 = trans_data_shard.collect()

    sc.stop()
Example #19
    def load_pickle(cls, path, minPartitions=None):
        sc = init_nncontext()
        return SparkXShards(sc.pickleFile(path, minPartitions))
Example #20
from zoo.models.recommendation import NeuralCF
from zoo.common.nncontext import init_nncontext
import matplotlib
from sklearn import metrics
from operator import itemgetter
from bigdl.nn.criterion import *
from bigdl.optim.optimizer import *
from bigdl.dataset import movielens
from bigdl.util.common import *

matplotlib.use('agg')
import matplotlib.pyplot as plt
%pylab inline

# Initialize the NN context; it returns a SparkContext with a configuration optimized for BigDL performance.
sc = init_nncontext("NCF Example")

# Data Preparation
# Download and read movielens 1M data
movielens_data = movielens.get_id_ratings("hdfs:///user/leelau/zoo/recommendation-ncf/*")

# Understand the data. Each record is in the format (userid, movieid, rating_score).
# UserIDs range between 1 and 6040; MovieIDs range between 1 and 3952.
# Ratings are made on a 5-star scale (whole-star ratings only).
# The user and movie counts are recorded for later use.
min_user_id = np.min(movielens_data[:, 0])
max_user_id = np.max(movielens_data[:, 0])
min_movie_id = np.min(movielens_data[:, 1])
max_movie_id = np.max(movielens_data[:, 1])
rating_labels = np.unique(movielens_data[:, 2])

print(movielens_data.shape)
print(min_user_id, max_user_id, min_movie_id, max_movie_id, rating_labels)
Example #21
from zoo.models.anomalydetection import AnomalyDetector
import pandas as pd
from pyspark.sql import SQLContext
from pyspark import sql
from optparse import OptionParser
import sys

if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("--input_dir", dest="input_dir")
    parser.add_option("-b", "--batch_size", dest="batch_size", default="1024")
    parser.add_option("--nb_epoch", dest="nb_epoch", default="20")
    parser.add_option("--unroll_len", dest="unroll_len", default="24")

    (options, args) = parser.parse_args(sys.argv)
    sc = init_nncontext("Anomaly Detection Example")

    sqlContext = sql.SQLContext(sc)

    def load_and_scale(input_path):
        df = pd.read_csv(input_path)
        df['datetime'] = pd.to_datetime(df['timestamp'])
        df['hours'] = df['datetime'].dt.hour
        df['awake'] = (((df['hours'] >= 6) & (df['hours'] <= 23)) |
                       (df['hours'] == 0)).astype(int)
        print(df.info())
        sqlContext = SQLContext(sc)
        dfspark = sqlContext.createDataFrame(df[["value", "hours", "awake"]])
        feature_size = len(["value", "hours", "awake"])
        return AnomalyDetector.standardScale(dfspark), feature_size
Example #22
    def predict(self, data, feature_cols=None):
        """
        Predict input data.

        :param data: data to be predicted. XShards, Spark DataFrame, numpy array and list of
               numpy arrays are supported. If data is an XShards, each partition is a dictionary
               of {'x': feature}, where feature is a numpy array or a list of numpy arrays.
        :param feature_cols: Feature column name(s) of data. Only used when data is a Spark
               DataFrame. Default: None.
        :return: predicted result.
                 If the input data is an XShards, the predict result is an XShards, and each
                 partition of the XShards is a dictionary of {'prediction': result}, where the
                 result is a numpy array or a list of numpy arrays.
                 If the input data is a numpy array or a list of numpy arrays, the predict
                 result is a numpy array or a list of numpy arrays.
        """
        from pyspark.sql import DataFrame

        def predict_transform(dict_data, batch_size):
            assert isinstance(dict_data, dict), "each shard should be a dict"
            assert "x" in dict_data, "key 'x' should be in each shard"
            feature_data = dict_data["x"]
            if isinstance(feature_data, np.ndarray):
                assert feature_data.shape[0] <= batch_size, \
                    "The batch size of input data (the first dim) should be no larger than " \
                    "the model batch size, otherwise some inputs will be ignored."
            elif isinstance(feature_data, list):
                for elem in feature_data:
                    assert isinstance(elem, np.ndarray), "Each element in the x list should be " \
                                                         "an ndarray, but got " + \
                                                         elem.__class__.__name__
                    assert elem.shape[0] <= batch_size, "The batch size of each input data (the " \
                                                        "first dim) should be no larger than " \
                                                        "the model batch size, otherwise some " \
                                                        "inputs will be ignored."
            else:
                raise ValueError(
                    "x in each shard should be an ndarray or a list of ndarrays."
                )
            return feature_data

        sc = init_nncontext()

        if isinstance(data, DataFrame):
            from zoo.orca.learn.utils import dataframe_to_xshards, convert_predict_rdd_to_dataframe
            xshards, _ = dataframe_to_xshards(data,
                                              validation_data=None,
                                              feature_cols=feature_cols,
                                              label_cols=None,
                                              mode="predict")
            transformed_data = xshards.transform_shard(predict_transform,
                                                       self.batch_size)
            result_rdd = self.model.distributed_predict(
                transformed_data.rdd, sc)

            def delete_useless_result(data):
                shard, y = data
                data_length = len(shard["x"])
                return y[:data_length]

            result_rdd = xshards.rdd.zip(result_rdd).map(delete_useless_result)
            return convert_predict_rdd_to_dataframe(
                data, result_rdd.flatMap(lambda data: data))
        elif isinstance(data, SparkXShards):
            transformed_data = data.transform_shard(predict_transform,
                                                    self.batch_size)
            result_rdd = self.model.distributed_predict(
                transformed_data.rdd, sc)

            def update_shard(data):
                shard, y = data
                data_length = len(shard["x"])
                shard["prediction"] = y[:data_length]
                return shard

            return SparkXShards(data.rdd.zip(result_rdd).map(update_shard))
        elif isinstance(data, (np.ndarray, list)):
            if isinstance(data, np.ndarray):
                split_num = math.ceil(len(data) / self.batch_size)
                arrays = np.array_split(data, split_num)
                data_length_list = list(map(lambda arr: len(arr), arrays))
                data_rdd = sc.parallelize(arrays, numSlices=split_num)
            elif isinstance(data, list):
                flattened = nest.flatten(data)
                data_length = len(flattened[0])
                data_to_be_rdd = []
                split_num = math.ceil(flattened[0].shape[0] / self.batch_size)
                for i in range(split_num):
                    data_to_be_rdd.append([])
                for x in flattened:
                    assert isinstance(x, np.ndarray), "the data in the data list should be " \
                                                      "ndarrays, but get " + \
                                                      x.__class__.__name__
                    assert len(x) == data_length, \
                        "the ndarrays in data must all have the same size in first dimension" \
                        ", got first ndarray of size {} and another {}".format(data_length, len(x))
                    x_parts = np.array_split(x, split_num)
                    for idx, x_part in enumerate(x_parts):
                        data_to_be_rdd[idx].append(x_part)
                        data_length_list = list(
                            map(lambda arr: len(arr), x_part))

                data_to_be_rdd = [
                    nest.pack_sequence_as(data, shard)
                    for shard in data_to_be_rdd
                ]
                data_rdd = sc.parallelize(data_to_be_rdd, numSlices=split_num)

            result_rdd = self.model.distributed_predict(data_rdd, sc)
            result_arr_list = result_rdd.collect()
            for i in range(0, len(result_arr_list)):
                result_arr_list[i] = result_arr_list[i][:data_length_list[i]]
            result_arr = np.concatenate(result_arr_list, axis=0)
            return result_arr
        else:
            raise ValueError(
                "Only XShards, Spark DataFrame, a numpy array and a list of numpy "
                "arrays are supported as input data, but got " +
                data.__class__.__name__)
Example #23
                      dest="encoder_output_dim",
                      default="256")
    parser.add_option("--training_split", dest="training_split", default="0.8")
    parser.add_option("-b", "--batch_size", dest="batch_size", default="128")
    parser.add_option("-e", "--nb_epoch", dest="nb_epoch", default="20")
    parser.add_option("-l",
                      "--learning_rate",
                      dest="learning_rate",
                      default="0.01")
    parser.add_option("--log_dir",
                      dest="log_dir",
                      default="/tmp/.analytics-zoo")
    parser.add_option("-m", "--model", dest="model")

    (options, args) = parser.parse_args(sys.argv)
    sc = init_nncontext("Text Classification Example")

    text_set = TextSet.read(path=options.data_path).to_distributed(
        sc, int(options.partition_num))
    print("Processing text dataset...")
    transformed = text_set.tokenize().normalize()\
        .word2idx(remove_topN=10, max_words_num=int(options.max_words_num))\
        .shape_sequence(len=int(options.sequence_length)).generate_sample()
    train_set, val_set = transformed.random_split(
        [float(options.training_split), 1 - float(options.training_split)])

    if options.model:
        model = TextClassifier.load_model(options.model)
    else:
        token_length = int(options.token_length)
        if not (token_length == 50 or token_length == 100
Example #24
    def predict(self, data, **kwargs):
        def predict_transform(dict_data, batch_size):
            assert isinstance(dict_data, dict), "each shard should be a dict"
            assert "x" in dict_data, "key 'x' should be in each shard"
            feature_data = dict_data["x"]
            if isinstance(feature_data, np.ndarray):
                assert feature_data.shape[1] <= batch_size, \
                    "The batch size of input data (the second dim) should be no larger than " \
                    "the model batch size, otherwise some inputs will be ignored."
            elif isinstance(feature_data, list):
                for elem in feature_data:
                    assert isinstance(elem, np.ndarray), "Each element in the x list should be " \
                                                         "an ndarray, but got " + \
                                                         elem.__class__.__name__
                    assert elem.shape[1] <= batch_size, "The batch size of each input data (the " \
                                                        "second dim) should be no larger than " \
                                                        "the model batch size, otherwise some " \
                                                        "inputs will be ignored."
            else:
                raise ValueError(
                    "x in each shard should be an ndarray or a list of ndarrays."
                )
            return dict_data["x"]

        sc = init_nncontext()

        if isinstance(data, SparkXShards):
            assert sc is not None, "You should pass sc(spark context) if data is a XShards."
            from zoo.orca.learn.utils import convert_predict_to_xshard
            data = data.transform_shard(predict_transform, self.batch_size)
            result_rdd = self.model.distributed_predict(data.rdd, sc)
            return convert_predict_to_xshard(result_rdd)
        elif isinstance(data, (np.ndarray, list)):
            total_core_num = self.core_num * self.node_num
            if isinstance(data, np.ndarray):
                assert data.shape[1] <= self.batch_size, "The batch size of input data (the " \
                                                         "second dim) should be less than the " \
                                                         "model batch size, otherwise some " \
                                                         "inputs will be ignored."
                split_num = min(total_core_num, data.shape[0])
                arrays = np.array_split(data, split_num)
                data_rdd = sc.parallelize(arrays, numSlices=split_num)
            elif isinstance(data, list):
                flattened = nest.flatten(data)
                data_length = len(flattened[0])
                data_to_be_rdd = []
                split_num = min(total_core_num, flattened[0].shape[0])
                for i in range(split_num):
                    data_to_be_rdd.append([])
                for x in flattened:
                    assert isinstance(x, np.ndarray), "the data in the data list should be " \
                                                      "ndarrays, but get " + \
                                                      x.__class__.__name__
                    assert len(x) == data_length, \
                        "the ndarrays in data must all have the same size in first dimension" \
                        ", got first ndarray of size {} and another {}".format(data_length, len(x))
                    assert x.shape[1] <= self.batch_size, "The batch size of each input data (" \
                                                          "the second dim) should be less than " \
                                                          "the model batch size, otherwise some " \
                                                          "inputs will be ignored."
                    x_parts = np.array_split(x, split_num)
                    for idx, x_part in enumerate(x_parts):
                        data_to_be_rdd[idx].append(x_part)

                data_to_be_rdd = [
                    nest.pack_sequence_as(data, shard)
                    for shard in data_to_be_rdd
                ]
                data_rdd = sc.parallelize(data_to_be_rdd, numSlices=split_num)

            result_rdd = self.model.distributed_predict(data_rdd, sc)
            result_arr_list = result_rdd.collect()
            result_arr = np.concatenate(result_arr_list, axis=0)
            return result_arr
        else:
            raise ValueError(
                "Only XShards, a numpy array and a list of numpy arrays are supported "
                "as input data, but got " + data.__class__.__name__)
Example #25
    parser.add_option("--training_split", dest="training_split", default="0.8")
    parser.add_option("-b", "--batch_size", dest="batch_size", default="128")
    parser.add_option("--nb_epoch", dest="nb_epoch", default="20")
    parser.add_option("-l", "--learning_rate", dest="learning_rate", default="0.01")
    parser.add_option("--log_dir", dest="log_dir", default="/tmp/.bigdl")
    parser.add_option("--model", dest="model")

    (options, args) = parser.parse_args(sys.argv)
    data_path = options.data_path
    token_length = int(options.token_length)
    sequence_len = int(options.sequence_length)
    max_words_num = int(options.max_words_num)
    training_split = float(options.training_split)
    batch_size = int(options.batch_size)

    sc = init_nncontext(create_spark_conf().setAppName("Text Classification Example"))

    print('Processing text dataset...')
    texts = get_news20(base_dir=data_path)
    text_data_rdd = sc.parallelize(texts, options.partition_num)

    word_meta = analyze_texts(text_data_rdd)
    # Remove the top 10 words roughly. You might want to fine tune this.
    word_meta = dict(word_meta[10: max_words_num])
    word_meta_broadcast = sc.broadcast(word_meta)

    word2vec = get_glove(base_dir=data_path, dim=token_length)
    # Ignore those unknown words.
    filtered_word2vec = dict((w, v) for w, v in word2vec.items() if w in word_meta)
    filtered_word2vec_broadcast = sc.broadcast(filtered_word2vec)
Example #26
if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("--image",
                      type=str,
                      dest="img_path",
                      help="The path where the images are stored, "
                      "can be either a folder or an image path")
    parser.add_option("--model",
                      type=str,
                      dest="model_path",
                      help="Path to the TensorFlow model file")

    (options, args) = parser.parse_args(sys.argv)

    sc = init_nncontext("OpenVINO Object Detection Inference Example")
    images = ImageSet.read(options.img_path,
                           sc,
                           resize_height=600,
                           resize_width=600).get_image().collect()
    input_data = np.concatenate(
        [image.reshape((1, 1) + image.shape) for image in images], axis=0)
    model_path = options.model_path
    model = InferenceModel()
    model.load_openvino(model_path,
                        weight_path=model_path[:model_path.rindex(".")] +
                        ".bin")
    predictions = model.predict(input_data)
    # Print the detection result of the first image.
    print(predictions[0])
Example #27
    # Transpose TensorFlow NHWC format to Analytics Zoo NCHW format.
    model.add(Transpose([(2, 4), (2, 3)]))
    model.add(Contiguous())
    model.add(detector)
    # Select the detection_boxes from the output.
    model.add(SelectTable(2))
    image_set = ImageSet.read(img_path, sc, partition_num)
    transformer = ChainedPreprocessing([ImageResize(256, 256), ImageMatToTensor(),
                                        ImageSetToSample()])
    transformed_image_set = image_set.transform(transformer)
    output = model.predict_image(transformed_image_set.to_image_frame(), batch_per_partition=1)
    # Print the detection box with the highest score of the first prediction result.
    result = output.get_predict().first()
    print(result[1][0])


if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("--image", type=str, dest="img_path",
                      help="The path where the images are stored, "
                           "can be either a folder or an image path")
    parser.add_option("--model", type=str, dest="model_path",
                      help="The path of the TensorFlow object detection model")
    parser.add_option("--partition_num", type=int, dest="partition_num", default=4,
                      help="The number of partitions")
    (options, args) = parser.parse_args(sys.argv)

    sc = init_nncontext("TFNet Object Detection Example")

    predict(options.model_path, options.img_path, options.partition_num)
Example #28
from zoo.common.nncontext import init_nncontext
from zoo.feature.image import *
import cv2
import numpy as np
from IPython.display import Image, display
sc = init_nncontext("Image Augmentation Example")

# create LocalImageSet from an image
local_image_set = ImageSet.read("/home/cdsw/image-augmentation/image/test.jpg")

# create LocalImageSet from an image folder
local_image_set = ImageSet.read("/home/cdsw/image-augmentation/image")

# create LocalImageSet from list of images
image = cv2.imread("/home/cdsw/image-augmentation/image/test.jpg")
local_image_set = LocalImageSet([image])

print(local_image_set.get_image())
print('isDistributed: ', local_image_set.is_distributed(), ', isLocal: ', local_image_set.is_local())

# create DistributedImageSet from an image
distributed_image_set = ImageSet.read("/home/cdsw/image-augmentation/image/test.jpg", sc, 2)

# create DistributedImageSet from an image folder
distributed_image_set = ImageSet.read("/home/cdsw/image-augmentation/image/", sc, 2)

# create LocalImageSet from image rdd
image = cv2.imread("/home/cdsw/image-augmentation/image/test.jpg")
image_rdd = sc.parallelize([image], 2)
label_rdd = sc.parallelize([np.array([1.0])], 2)
distributed_image_set = DistributedImageSet(image_rdd, label_rdd)
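
A short follow-up sketch reusing the ImageSet.transform pattern shown elsewhere on this page; the resize size is illustrative.

# Apply a simple augmentation to the distributed image set, reusing the
# transform pattern used in the other examples on this page.
resized_image_set = distributed_image_set.transform(ImageResize(256, 256))
print(resized_image_set.get_image().take(1))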