def main():
    parser = get_parser()
    args = parser.parse_args()
    cores = args.executor_cores
    conf = (common.create_spark_conf()
            .setAppName('pyspark-mnist')
            .setMaster(args.master))
    conf = conf.set('spark.executor.cores', cores)
    conf = conf.set('spark.cores.max', cores)
    conf.set("spark.jars", os.environ.get('BIGDL_JARS'))
    LOG.info('initialize with spark conf:')
    LOG.info(conf.getAll())
    sc = pyspark.SparkContext(conf=conf)
    common.init_engine()
    model = layer.Model.loadModel(args.model_dir + "/model.pb",
                                  args.model_dir + "/model.bin")
    files = glob.glob(args.input + '/*.png')

    def mapper(x):
        # Load each PNG as a normalized 1x28x28 float32 array.
        image = imageio.imread('file://' + x).astype(np.float32).reshape(
            1, 28, 28) / 255
        return image

    dataRDD = sc.parallelize(files).map(mapper)
    predictRDD = dataRDD.map(
        lambda x: common.Sample.from_ndarray(x, np.array([2.0])))
    counts = model.predict(predictRDD).map(
        lambda x: (np.argmax(x) + 1, 1)).reduceByKey(lambda a, b: a + b)
    for x in counts.collect():
        LOG.info("%d count is %d", x[0], x[1])
    sc.stop()
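# The prediction main() above assumes a module preamble that is not shown in
# this excerpt. Below is a minimal sketch of it, assuming the BigDL 0.x module
# layout (bigdl.util.common, bigdl.nn.layer) and a simple argparse-based
# get_parser(); the flag names are inferred from the attribute accesses in
# main() and are assumptions, not the original definitions. In the actual
# script these lines would normally sit at the top of the file.
import argparse
import glob
import logging
import os

import imageio
import numpy as np
import pyspark
from bigdl.nn import layer
from bigdl.util import common

LOG = logging.getLogger(__name__)


def get_parser():
    parser = argparse.ArgumentParser(
        description='MNIST prediction with BigDL on PySpark')
    parser.add_argument('--master', required=True, help='Spark master URL')
    parser.add_argument('--executor-cores', dest='executor_cores',
                        type=int, default=1)
    parser.add_argument('--model-dir', dest='model_dir', required=True,
                        help='directory containing model.pb and model.bin')
    parser.add_argument('--input', required=True,
                        help='directory of PNG images to classify')
    return parser


if __name__ == '__main__':
    main()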
def perf(model_path, batch_size, iteration):
    batch_input = np.random.rand(batch_size, 3, 224, 224)
    single_input = np.random.rand(1, 3, 224, 224)
    init_engine()
    model = ImageClassifier.load_model(model_path)
    model.set_evaluate_status()
    # Measure throughput on full batches.
    for i in range(iteration):
        start = time.time_ns()
        model.forward(batch_input)
        time_used = time.time_ns() - start
        throughput = round(batch_size / (time_used / 10 ** 9), 2)
        print("Iteration:" + str(i) + ", batch " + str(batch_size)
              + ", takes " + str(time_used) + " ns"
              + ", throughput is " + str(throughput) + " imgs/sec")

    # An mkldnn model forwards a fixed batch size, so a fresh model
    # is needed to measure single-image latency.
    model2 = ImageClassifier.load_model(model_path)
    model2.set_evaluate_status()
    for i in range(iteration):
        start = time.time_ns()
        model2.forward(single_input)
        latency = time.time_ns() - start
        print("Iteration:" + str(i) + ", latency for a single image is "
              + str(latency / 10 ** 6) + " ms")
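# A minimal command-line driver for perf() above, assuming the function lives
# in this module; the flag names and defaults here are illustrative and not
# part of the original script.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description='BigDL image-classifier throughput/latency benchmark')
    parser.add_argument('--model-path', dest='model_path', required=True,
                        help='path to the saved model')
    parser.add_argument('--batch-size', dest='batch_size', type=int, default=32)
    parser.add_argument('--iteration', type=int, default=100)
    args = parser.parse_args()
    perf(args.model_path, args.batch_size, args.iteration)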
def _test():
    import doctest
    from pyspark import SparkContext
    from bigdl.optim import optimizer
    from bigdl.util.common import init_engine
    from bigdl.util.common import create_spark_conf
    globs = optimizer.__dict__.copy()
    sc = SparkContext(master="local[4]", appName="test optimizer",
                      conf=create_spark_conf())
    init_engine()
    globs['sc'] = sc
    (failure_count, test_count) = doctest.testmod(globs=globs,
                                                  optionflags=doctest.ELLIPSIS)
    if failure_count:
        exit(-1)
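# A minimal guard so the doctest runner above can be invoked as a script;
# the guard itself is an assumption, not part of the original excerpt.
if __name__ == "__main__":
    _test()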
conf = create_spark_conf() \
    .setAppName("Spark_Basic_Learning") \
    .setMaster("local[4]") \
    .set("spark.sql.warehouse.dir", "file:///C:/Spark/temp") \
    .set("spark.sql.streaming.checkpointLocation", "file:///C:/Spark/checkpoint") \
    .set("spark.sql.execution.arrow.enabled", "true")
# .set("spark.sql.execution.arrow.maxRecordsPerBatch", "")
# Utsav: tweak only if memory limits are known. Default = 10,000.

spark = SparkSession.builder \
    .config(conf=conf) \
    .getOrCreate()

# Initialize the BigDL engine.
init_engine()

df = spark.read.format("csv") \
    .option("inferSchema", "true") \
    .option("header", "true") \
    .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZZ") \
    .load("../resources/datasets/dataset-1_converted.csv")

assembler = VectorAssembler(
    inputCols=["processing-time", "carparkID"],
    outputCol="features")
df = assembler.transform(df)
df = df.withColumnRenamed('slotOccupancy', 'label')
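# With the features assembled and the target renamed to 'label', a natural
# next step is to hold out part of the data for validation. This uses the
# standard DataFrame.randomSplit API; the 80/20 ratio and the seed are
# illustrative choices, not taken from the original script.
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)
print("train rows:", train_df.count(), "test rows:", test_df.count())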
def main():
    parser = get_parser()
    args = parser.parse_args()
    # BATCH_SIZE must be a multiple of <executor-cores>:
    # in this case a multiple of 3: 3, 6, 9, 12, etc.
    if args.batch_size % args.executor_cores != 0:
        raise RuntimeError(
            'batch size must be a multiple of the <executor-cores> parameter!')
    cores = args.executor_cores
    batch_size = args.batch_size
    conf = (
        common.create_spark_conf()
        .setAppName('pyspark-mnist')
        .setMaster(args.master)
    )
    conf = conf.set('spark.executor.cores', cores)
    conf = conf.set('spark.cores.max', cores)
    conf.set("spark.jars", os.environ.get('BIGDL_JARS'))
    LOG.info('initialize with spark conf:')
    sc = pyspark.SparkContext(conf=conf)
    common.init_engine()

    LOG.info('initialize training RDD:')
    # Files from the kuberlab dataset.
    files = glob.glob(os.environ.get('DATA_DIR') + '/train/*.png')
    LOG.info('Train size: %d', len(files))

    def mapper(x):
        # The label is encoded in the file name; shift to 1-based indexing.
        label = int(x.split('/')[-1].split('-')[-1][:-4]) + 1
        image = imageio.imread('file://' + x).astype(np.float32).reshape(1, 28, 28) / 255
        return common.Sample.from_ndarray(image, label)

    train_rdd = sc.parallelize(files).map(mapper)
    opt = optimizer.Optimizer(
        model=build_model(10),
        training_rdd=train_rdd,
        criterion=criterion.ClassNLLCriterion(),
        optim_method=optimizer.SGD(
            learningrate=0.01,
            learningrate_decay=0.0002
        ),
        end_trigger=optimizer.MaxEpoch(args.epoch),
        batch_size=batch_size
    )
    trained_model = opt.optimize()
    LOG.info("training finished")

    LOG.info('saving model...')
    path = args.output_dir
    if not os.path.exists(path):
        os.makedirs(path)
    trained_model.saveModel(
        path + '/model.pb',
        path + '/model.bin',
        over_write=True
    )
    client.update_task_info({'checkpoint_path': path, 'model_path': path})
    LOG.info('successfully saved!')

    files = glob.glob(os.environ.get('DATA_DIR') + '/test/*.png')
    LOG.info('Validation size: %d', len(files))
    test_rdd = sc.parallelize(files).map(mapper)
    results = trained_model.evaluate(test_rdd, batch_size,
                                     [optimizer.Top1Accuracy()])
    accuracy = results[0].result
    client.update_task_info({'test_accuracy': float(accuracy)})
    sc.stop()
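# build_model(10) in the training main() above is not defined in this excerpt.
# A plausible sketch follows, modeled on BigDL's LeNet-5 example and assuming
# `from bigdl.nn import layer` (the BigDL 0.x module layout); treat the exact
# architecture as an assumption rather than the original definition. The final
# LogSoftMax pairs with the ClassNLLCriterion used by the Optimizer above.
def build_model(class_num):
    model = layer.Sequential()
    model.add(layer.Reshape([1, 28, 28]))
    model.add(layer.SpatialConvolution(1, 6, 5, 5))
    model.add(layer.Tanh())
    model.add(layer.SpatialMaxPooling(2, 2, 2, 2))
    model.add(layer.Tanh())
    model.add(layer.SpatialConvolution(6, 12, 5, 5))
    model.add(layer.SpatialMaxPooling(2, 2, 2, 2))
    model.add(layer.Reshape([12 * 4 * 4]))
    model.add(layer.Linear(12 * 4 * 4, 100))
    model.add(layer.Tanh())
    model.add(layer.Linear(100, class_num))
    model.add(layer.LogSoftMax())
    return model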