Example 1
def main():
    parser = get_parser()
    args = parser.parse_args()

    cores = args.executor_cores
    conf = (common.create_spark_conf().setAppName('pyspark-mnist').setMaster(
        args.master))
    conf = conf.set('spark.executor.cores', cores)
    conf = conf.set('spark.cores.max', cores)
    conf.set("spark.jars", os.environ.get('BIGDL_JARS'))

    LOG.info('initialize with spark conf:')
    LOG.info(conf.getAll())
    sc = pyspark.SparkContext(conf=conf)
    common.init_engine()

    model = layer.Model.loadModel(args.model_dir + "/model.pb",
                                  args.model_dir + "/model.bin")

    files = glob.glob(args.input + '/*.png')

    def mapper(x):
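        # Read each PNG, scale pixel values to [0, 1] and reshape to a 1x28x28 float32 tensor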
        image = imageio.imread('file://' + x).astype(np.float32).reshape(
            1, 28, 28) / 255
        return image

    dataRDD = sc.parallelize(files).map(mapper)
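    # Wrap each image in a BigDL Sample with a dummy label, run prediction,
    # then count how often each (1-based) class index is predicted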
    predictRDD = dataRDD.map(
        lambda x: common.Sample.from_ndarray(x, np.array([2.0])))
    counts = model.predict(predictRDD).map(
        lambda x: (np.argmax(x) + 1, 1)).reduceByKey(lambda a, b: a + b)
    for x in counts.collect():
        LOG.info("%d count is %d", x[0], x[1])

    sc.stop()
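
get_parser() is not shown in this example; a minimal argparse sketch is given below. The flag spellings are assumptions, only the destination names master, executor_cores, model_dir and input are taken from the code above.

def get_parser():
    import argparse
    # Hypothetical parser: it only defines the arguments the script actually reads
    parser = argparse.ArgumentParser(description='pyspark-mnist prediction')
    parser.add_argument('--master', required=True, help='Spark master URL')
    parser.add_argument('--executor-cores', type=int, required=True,
                        help='cores per executor (also used as spark.cores.max)')
    parser.add_argument('--model-dir', required=True,
                        help='directory containing model.pb and model.bin')
    parser.add_argument('--input', required=True,
                        help='directory with *.png images to classify')
    return parser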
Example 2
def perf(model_path, batch_size, iteration):
    batch_input = np.random.rand(batch_size, 3, 224, 224)
    single_input = np.random.rand(1, 3, 224, 224)
    init_engine()

    model = ImageClassifier.load_model(model_path)
    model.set_evaluate_status()

    for i in range(iteration):
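        # Throughput test: time one forward pass over the whole batch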
        start = time.time_ns()
        model.forward(batch_input)
        time_used = time.time_ns() - start
        throughput = round(batch_size / (time_used / 10 ** 9), 2)
        print("Iteration:" + str(i) +
              ", batch " + str(batch_size) +
              ", takes " + str(time_used) + " ns" +
              ", throughput is " + str(throughput) + " imgs/sec")

    # The MKL-DNN model forwards with a fixed batch size,
    # so a fresh model is needed for the single-image latency test.
    model2 = ImageClassifier.load_model(model_path)
    model2.set_evaluate_status()

    for i in range(iteration):
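        # Latency test: time a forward pass on a single image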
        start = time.time_ns()
        model2.forward(single_input)
        latency = time.time_ns() - start
        print("Iteration:" + str(i) +
              ", latency for a single image is " + str(latency / 10 ** 6) + " ms")
Example 3
def _test():
    import doctest
    from pyspark import SparkContext
    from bigdl.optim import optimizer
    from bigdl.util.common import init_engine
    from bigdl.util.common import create_spark_conf
    globs = optimizer.__dict__.copy()
    sc = SparkContext(master="local[4]", appName="test optimizer",
                      conf=create_spark_conf())
    init_engine()
    globs['sc'] = sc
    (failure_count, test_count) = doctest.testmod(globs=globs,
                                                  optionflags=doctest.ELLIPSIS)
    if failure_count:
        exit(-1)
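
In the BigDL sources this doctest runner is normally triggered when the module is executed directly; a minimal guard would look like this (assumed, not part of the example):

if __name__ == "__main__":
    _test()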
Example 4

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from bigdl.util.common import create_spark_conf, init_engine

conf = create_spark_conf() \
    .setAppName("Spark_Basic_Learning") \
    .setMaster("local[4]") \
    .set("spark.sql.warehouse.dir", "file:///C:/Spark/temp") \
    .set("spark.sql.streaming.checkpointLocation", "file:///C:/Spark/checkpoint") \
    .set("spark.sql.execution.arrow.enabled", "true")
    # .set("spark.sql.execution.arrow.maxRecordsPerBatch", "")  # Utsav: tweak only if memory limits are known; default = 10,000

spark = SparkSession.builder \
    .config(conf=conf) \
    .getOrCreate()

# Initialize the BigDL engine
init_engine()

df = spark.read.format("csv") \
    .option("inferSchema", "true") \
    .option("header", "true") \
    .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZZ") \
    .load("../resources/datasets/dataset-1_converted.csv")

assembler = VectorAssembler(
    inputCols=["processing-time", "carparkID"],
    outputCol="features")

df = assembler.transform(df)

df = df.withColumnRenamed('slotOccupancy', 'label')
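# The DataFrame now exposes a 'features' vector column and a 'label' column,
# the shape expected by Spark ML / BigDL estimators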
Example 5
def main():
    parser = get_parser()
    args = parser.parse_args()

    # batch_size must be a multiple of <executor-cores>
    # (e.g. with 3 executor cores: 3, 6, 9, 12, ...).
    if args.batch_size % args.executor_cores != 0:
        raise RuntimeError(
            'batch size must be a multiple of the <executor-cores> parameter!'
        )

    cores = args.executor_cores
    batch_size = args.batch_size
    conf = (
        common.create_spark_conf()
            .setAppName('pyspark-mnist')
            .setMaster(args.master)
    )
    conf = conf.set('spark.executor.cores', cores)
    conf = conf.set('spark.cores.max', cores)
    conf.set("spark.jars",os.environ.get('BIGDL_JARS'))

    LOG.info('initialize with spark conf:')
    sc = pyspark.SparkContext(conf=conf)
    common.init_engine()

    LOG.info('initialize training RDD:')

    # Files from the kuberlab dataset
    files = glob.glob(os.environ.get('DATA_DIR') + '/train/*.png')
    LOG.info('Train size: %d', len(files))

    def mapper(x):
        # Parse the 1-based label from the file name (e.g. '...-7.png' -> 8)
        label = int(x.split('/')[-1].split('-')[-1][:-4]) + 1
        # Scale pixels to [0, 1] and reshape to a 1x28x28 float32 tensor
        image = imageio.imread('file://' + x).astype(np.float32).reshape(1, 28, 28) / 255
        return common.Sample.from_ndarray(image, label)

    train_rdd = sc.parallelize(files).map(mapper)

    # Distributed training: ClassNLLCriterion loss, plain SGD with
    # learning-rate decay, run for args.epoch epochs.
    opt = optimizer.Optimizer(
        model=build_model(10),
        training_rdd=train_rdd,
        criterion=criterion.ClassNLLCriterion(),
        optim_method=optimizer.SGD(
            learningrate=0.01, learningrate_decay=0.0002
        ),
        end_trigger=optimizer.MaxEpoch(args.epoch),
        batch_size=batch_size
    )
    trained_model = opt.optimize()
    LOG.info("training finished")
    LOG.info('saving model...')
    path = args.output_dir
    if not os.path.exists(path):
        os.makedirs(path)
    trained_model.saveModel(
        path + '/model.pb',
        path + '/model.bin',
        over_write=True
    )
    client.update_task_info({'checkpoint_path': path, 'model_path': path})
    LOG.info('successfully saved!')

    files = glob.glob(os.environ.get('DATA_DIR') + '/test/*.png')
    LOG.info('Validation size: %d', len(files))
    test_rdd = sc.parallelize(files).map(mapper)
    results = trained_model.evaluate(test_rdd, batch_size, [optimizer.Top1Accuracy()])
    accuracy = results[0].result
    client.update_task_info({'test_accuracy': float(accuracy)})
    sc.stop()
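
build_model(10) is not shown in this example. Because training uses ClassNLLCriterion, the network presumably ends in LogSoftMax; the sketch below is a minimal LeNet-style guess built with BigDL's layer API, not the example's actual model.

from bigdl.nn.layer import (Sequential, Reshape, SpatialConvolution, Tanh,
                            SpatialMaxPooling, Linear, LogSoftMax)

def build_model(class_num):
    # Classic LeNet-5-like network for 28x28 grayscale digits
    model = Sequential()
    model.add(Reshape([1, 28, 28]))
    model.add(SpatialConvolution(1, 6, 5, 5))
    model.add(Tanh())
    model.add(SpatialMaxPooling(2, 2, 2, 2))
    model.add(Tanh())
    model.add(SpatialConvolution(6, 12, 5, 5))
    model.add(SpatialMaxPooling(2, 2, 2, 2))
    model.add(Reshape([12 * 4 * 4]))     # 12 feature maps of 4x4 after two conv/pool stages
    model.add(Linear(12 * 4 * 4, 100))
    model.add(Tanh())
    model.add(Linear(100, class_num))
    model.add(LogSoftMax())              # pairs with ClassNLLCriterion used above
    return model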