def main():
    """Load a trained BigDL MNIST model and log counts per predicted class.

    Reads PNG digit images from ``args.input``, runs them through the model
    restored from ``args.model_dir``, and logs how many images fell into
    each (1-based) predicted class.
    """
    parser = get_parser()
    args = parser.parse_args()

    core_count = args.executor_cores
    spark_conf = (
        common.create_spark_conf()
        .setAppName('pyspark-mnist')
        .setMaster(args.master)
        .set('spark.executor.cores', core_count)
        .set('spark.cores.max', core_count)
    )
    # BigDL jars must be on the executor classpath.
    spark_conf.set("spark.jars", os.environ.get('BIGDL_JARS'))
    LOG.info('initialize with spark conf:')
    LOG.info(spark_conf.getAll())

    sc = pyspark.SparkContext(conf=spark_conf)
    common.init_engine()

    # Restore the serialized model: definition (.pb) + weights (.bin).
    model = layer.Model.loadModel(
        args.model_dir + "/model.pb",
        args.model_dir + "/model.bin",
    )

    png_files = glob.glob(args.input + '/*.png')

    def to_tensor(path):
        # Normalise each 28x28 grayscale PNG into a [0, 1] float array
        # shaped (1, 28, 28) as the model expects.
        return imageio.imread('file://' + path).astype(np.float32).reshape(
            1, 28, 28) / 255

    image_rdd = sc.parallelize(png_files).map(to_tensor)
    # A dummy label (2.0) is attached; prediction does not use it.
    sample_rdd = image_rdd.map(
        lambda img: common.Sample.from_ndarray(img, np.array([2.0])))
    # Map each prediction to its 1-based argmax class, then count per class.
    counts = model.predict(sample_rdd).map(
        lambda out: (np.argmax(out) + 1, 1)).reduceByKey(lambda a, b: a + b)
    for digit, n in counts.collect():
        LOG.info("%d count is %d", digit, n)
    sc.stop()
def _test():
    """Run the doctests embedded in ``bigdl.optim.optimizer``.

    Builds a local SparkContext for the doctest globals and exits the
    process with a non-zero status if any doctest fails.
    """
    import doctest
    import sys
    from pyspark import SparkContext
    from bigdl.optim import optimizer
    from bigdl.util.common import init_engine
    from bigdl.util.common import create_spark_conf

    globs = optimizer.__dict__.copy()
    sc = SparkContext(master="local[4]", appName="test optimizer",
                      conf=create_spark_conf())
    init_engine()
    globs['sc'] = sc
    (failure_count, test_count) = doctest.testmod(
        globs=globs, optionflags=doctest.ELLIPSIS)
    # Release the Spark context whether or not the doctests passed;
    # the original leaked it on every path.
    sc.stop()
    if failure_count:
        # sys.exit instead of the site-provided exit(): the latter is not
        # guaranteed to exist (e.g. when running with python -S).
        sys.exit(-1)
from bigdl.nn.criterion import MSECriterion from zoo.pipeline.nnframes import * from bigdl.util.common import init_engine, create_spark_conf from bigdl.nn.layer import * from bigdl.optim.optimizer import * from zoo.pipeline.nnframes import NNModel from pyspark.ml.feature import VectorAssembler from pyspark.sql import SparkSession cores = [1, 2, 3, 4] conf = create_spark_conf() \ .setAppName("Spark_Basic_Learning") \ .setMaster("local[4]") \ .set("spark.sql.warehouse.dir", "file:///C:/Spark/temp") \ .set("spark.sql.streaming.checkpointLocation", "file:///C:/Spark/checkpoint") \ .set("spark.sql.execution.arrow.enabled", "true") #.set("spark.sql.execution.arrow.maxRecordsPerBatch", "") # Utsav: Tweak only if memory limits are known. Default = 10,000 spark = SparkSession.builder \ .config(conf=conf) \ .getOrCreate() # Init Big DL Engine init_engine() df = spark.read.format("csv") \ .option("inferSchema", "true") \ .option("header", "true") \ .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZZ") \
def main():
    """Train a BigDL MNIST classifier on Spark, save it, then evaluate it.

    Reads train/test PNGs from $DATA_DIR, trains with SGD + ClassNLL,
    writes the model to ``args.output_dir`` and reports paths/accuracy
    through ``client.update_task_info``.
    """
    args = get_parser().parse_args()

    # BATCH_SIZE must be multiple of <executor.cores>:
    # in this case multiple of 3: 3,6,9,12 etc.
    if args.batch_size % args.executor_cores:
        raise RuntimeError(
            'batch size must be multiple of <executor-cores> parameter!'
        )

    num_cores = args.executor_cores
    mini_batch = args.batch_size

    spark_conf = (
        common.create_spark_conf()
        .setAppName('pyspark-mnist')
        .setMaster(args.master)
        .set('spark.executor.cores', num_cores)
        .set('spark.cores.max', num_cores)
    )
    # BigDL jars must be on the executor classpath.
    spark_conf.set("spark.jars", os.environ.get('BIGDL_JARS'))
    LOG.info('initialize with spark conf:')
    sc = pyspark.SparkContext(conf=spark_conf)
    common.init_engine()

    LOG.info('initialize training RDD:')
    # Files from kuberlab dataset
    data_root = os.environ.get('DATA_DIR')
    train_files = glob.glob(data_root + '/train/*.png')
    LOG.info('Train size: %d', len(train_files))

    def to_sample(path):
        # The class label is encoded in the filename tail ("...-<digit>.png");
        # it is shifted to 1-based as ClassNLLCriterion expects.
        label = int(path.split('/')[-1].split('-')[-1][:-4]) + 1
        # Normalise the 28x28 grayscale image into a (1, 28, 28) [0, 1] array.
        pixels = imageio.imread('file://' + path).astype(
            np.float32).reshape(1, 28, 28) / 255
        return common.Sample.from_ndarray(pixels, label)

    train_rdd = sc.parallelize(train_files).map(to_sample)
    trainer = optimizer.Optimizer(
        model=build_model(10),
        training_rdd=train_rdd,
        criterion=criterion.ClassNLLCriterion(),
        optim_method=optimizer.SGD(
            learningrate=0.01,
            learningrate_decay=0.0002,
        ),
        end_trigger=optimizer.MaxEpoch(args.epoch),
        batch_size=mini_batch,
    )
    trained_model = trainer.optimize()
    LOG.info("training finished")

    LOG.info('saving model...')
    path = args.output_dir
    if not os.path.exists(path):
        os.makedirs(path)
    trained_model.saveModel(
        path + '/model.pb',
        path + '/model.bin',
        over_write=True,
    )
    client.update_task_info({'checkpoint_path': path, 'model_path': path})
    LOG.info('successfully saved!')

    # Evaluate on the held-out set with the same preprocessing.
    test_files = glob.glob(data_root + '/test/*.png')
    LOG.info('Validation size: %d', len(test_files))
    test_rdd = sc.parallelize(test_files).map(to_sample)
    results = trained_model.evaluate(
        test_rdd, mini_batch, [optimizer.Top1Accuracy()])
    accuracy = results[0].result
    client.update_task_info({'test_accuracy': float(accuracy)})
    sc.stop()
from bigdl.util.common import create_spark_conf
from bigdl.util.common import JavaCreator
from bigdl.util.common import Sample
from vision.image3d.transformation import *
import h5py
from math import pi

# Load the 3-D meniscus volume from the bundled .mat file and wrap it as a
# BigDL Sample (the -1 label is a placeholder; cropping ignores it).
img_path = os.path.abspath(__file__ + "/../../resources/image_3d/a.mat")
volume = np.array(h5py.File(img_path)['meniscus_im'])
sample = Sample.from_ndarray(features=volume, label=np.array(-1))
print(sample.features[0].shape)

sc = SparkContext(appName="test", conf=create_spark_conf())
# Route python API calls to the 3-D vision backend instead of the default.
JavaCreator.set_creator_class(
    "com.intel.analytics.zoo.transform.vision.image3d.python.api.VisionPythonBigDL"
)
init_engine()

data_rdd = sc.parallelize([sample])

# Crop a 5x40x40 patch whose corner is at voxel (13, 80, 125).
start_loc = [13, 80, 125]
patch = [5, 40, 40]
crop = Crop(start=start_loc, patch_size=patch)
crop_rdd = crop(data_rdd)
crop_data = crop_rdd.first()
print(crop_data.features[0].shape)