def create_inference_graph(self, input_image, base_graph):
  log = util.create_log()

  with base_graph.as_default():
    sess = util.tf_cpu_session()
    with sess.as_default():
      tf_model = create_model()  # Create ops and load weights
      # root = tf.train.Checkpoint(model=tf_model)
      # root.restore(tf.train.latest_checkpoint(self.params.MODEL_BASEDIR))
      # log.info("Read model params from %s" % self.params.MODEL_BASEDIR)
      pred = tf_model(tf.cast(input_image, tf.float32), training=False)

      checkpoint = tf.train.latest_checkpoint(self.params.MODEL_BASEDIR)
      saver = tf.train.import_meta_graph(
        checkpoint + '.meta', clear_devices=True)
      self.graph = util.give_me_frozen_graph(
        checkpoint,
        nodes=self.output_names,
        saver=saver,
        base_graph=base_graph,
        sess=sess)

      import pprint
      log.info("Loaded graph:")
      log.info(pprint.pformat(tf.contrib.graph_editor.get_tensors(self.graph)))

  return self.graph
def create_test_fixtures(cls):
  log = util.create_log()
  log.info("Creating bdd100k test fixtures ...")

  ZIPS_TO_COPY = (cls.telemetry_zip(),)
  util.cleandir(cls.TEST_FIXTURE_DIR)
  for path in ZIPS_TO_COPY:
    util.copy_n_from_zip(path, cls.test_fixture(path), 10)

  # Videos: just copy the ones that have INFO data
  log.info("Copying videos ...")
  fws = util.ArchiveFileFlyweight.fws_from(
    cls.test_fixture(cls.telemetry_zip()))
  for fw in fws:
    if 'json' not in fw.name:
      continue
    relpath = InfoDataset.json_fname_to_video_fname(fw.name)
    relpath = relpath[len('bdd100k/info/'):]
    path = os.path.join(cls.video_dir(), relpath)
    dest = cls.test_fixture(path)
    util.mkdir(os.path.dirname(dest))
    util.run_cmd('cp -v ' + path + ' ' + dest)
  log.info("... done copying videos.")

  # For testing, create a video that has no INFO
  dest = cls.test_fixture(
    os.path.join(
      cls.video_dir(), '100k', 'train', 'video_with_no_info.mov'))
  codec = 'h264'  # Chrome will not play `png` movies
  video_bytes = testutils.VideoFixture(codec=codec).get_bytes()
  with open(dest, 'wb') as f:
    f.write(video_bytes)
  log.info("Wrote synth video to %s ..." % dest)
def mnist_train(params):
  log = util.create_log()
  tf.logging.set_verbosity(tf.logging.DEBUG)

  ## Model
  model_dir = params.MODEL_BASEDIR
  tf.gfile.MakeDirs(params.MODEL_BASEDIR)
  mnist_classifier = tf.estimator.Estimator(
    model_fn=model_fn,
    params=None,
    config=tf.estimator.RunConfig(
      model_dir=model_dir,
      save_summary_steps=10,
      save_checkpoints_secs=10,
      session_config=util.tf_create_session_config(),
      log_step_count_steps=10))

  ## Data
  def train_input_fn():
    from official.mnist import dataset as mnist_dataset

    # Load the dataset, optionally truncated for smoke tests
    train_ds = mnist_dataset.train(params.DATA_BASEDIR)
    if params.LIMIT >= 0:
      train_ds = train_ds.take(params.LIMIT)
    train_ds = train_ds.shuffle(60000).batch(params.BATCH_SIZE)
    return train_ds

  def eval_input_fn():
    test_ds = test_dataset(params)
    # Return batch tensors from a one-shot iterator rather than the Dataset
    # itself
    return test_ds.make_one_shot_iterator().get_next()

  # Set up hooks that output training logs every 100 steps
  from official.utils.logs import hooks_helper
  train_hooks = hooks_helper.get_train_hooks(
    ['ExamplesPerSecondHook', 'LoggingTensorHook'],
    model_dir=model_dir,
    batch_size=params.BATCH_SIZE)

  # Train and evaluate the model
  for _ in range(params.TRAIN_EPOCHS):
    mnist_classifier.train(input_fn=train_input_fn, hooks=train_hooks)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    log.info('\nEvaluation results:\n\t%s\n' % eval_results)

  # Export the model as a SavedModel; the placeholder defines the raw serving
  # input (a batch of 28x28x1 float images)
  image = tf.placeholder(tf.float32, [None, 28, 28, 1], name='input_image')
  input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({
    'image': image,
  })
  mnist_classifier.export_savedmodel(params.MODEL_BASEDIR, input_fn)
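# The sketch below shows one way the SavedModel exported above might be loaded
# for inference; it is an illustration only and not part of the original
# module. It assumes a TF 1.x runtime (tf.contrib.predictor) and that
# export_savedmodel() wrote a timestamp-named subdirectory under
# params.MODEL_BASEDIR. The function name is hypothetical.
def load_latest_exported_model(params):
  import os
  import tensorflow as tf

  # export_savedmodel() names each export directory with a unix timestamp
  exports = [
    os.path.join(params.MODEL_BASEDIR, d)
    for d in os.listdir(params.MODEL_BASEDIR)
    if d.isdigit()]
  assert exports, "No SavedModel exports found in %s" % params.MODEL_BASEDIR

  # Wrap the most recent export's serving signature in a callable
  predict_fn = tf.contrib.predictor.from_saved_model(sorted(exports)[-1])
  return predict_fn

# Usage (hypothetical):
#   predict_fn = load_latest_exported_model(params)
#   outputs = predict_fn({'image': images})  # images: [N, 28, 28, 1] float32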
def rows_from_images_dir(img_dir, pattern='*', **kwargs):
  import pathlib2 as pathlib

  log = create_log()
  log.info("Reading images from dir %s ..." % img_dir)
  paths = pathlib.Path(img_dir).glob(pattern)
  n = 0
  for path in paths:
    path = str(path)  # pathlib yields PosixPath objects; we want plain strings
    yield ImageRow.from_path(path, **kwargs)
    n += 1
    if (n % 100) == 0:
      log.info("... read %s paths ..." % n)
  log.info("... read %s total paths." % n)
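# A minimal usage sketch for the generator above, assuming it is exposed as a
# staticmethod of ImageRow (call it as a module-level function otherwise); the
# directory path, glob pattern, and extra ImageRow attributes are hypothetical:
#
#   rows = ImageRow.rows_from_images_dir(
#     '/tmp/my_images', pattern='**/*.png', dataset='my_dataset', split='train')
#   for row in rows:
#     process(row)  # e.g. feed into ImageRow.write_to_parquet(...)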
def setup(cls, spark=None):
  log = util.create_log()
  log.info("Building table %s ..." % cls.TABLE_NAME)

  spark = spark or util.Spark.getOrCreate()
  img_rdd = cls.IMAGE_TABLE_CLS.as_imagerow_rdd(spark)
  model = cls.NNMODEL_CLS.load_or_train(cls.MODEL_PARAMS)
  filler = FillActivationsTFDataset(model=model)
  activated = img_rdd.mapPartitions(filler)

  def to_activation_rows(imagerows):
    from pyspark.sql import Row
    for row in imagerows:
      if row.attrs == '':
        continue
      activations = row.attrs.get('activations')
      if not activations:
        continue
      for act in activations:
        for tensor_name, value in act._tensor_to_value.iteritems():
          yield Row(
            model_name=model.params.MODEL_NAME,
            tensor_name=tensor_name,
            tensor_value=value,
            dataset=row.dataset,
            split=row.split,
            uri=row.uri)

  activation_row_rdd = activated.mapPartitions(to_activation_rows)
  df = spark.createDataFrame(activation_row_rdd)
  df.show()
  df.write.parquet(
    path=cls.table_root(),
    mode='overwrite',
    compression='lz4',
    partitionBy=dataset.ImageRow.DEFAULT_PQ_PARTITION_COLS)
  log.info("... wrote to %s ." % cls.table_root())
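# A minimal usage sketch for the table builder above. `MyActivationsTable` is a
# hypothetical subclass that binds TABLE_NAME, IMAGE_TABLE_CLS, NNMODEL_CLS,
# and MODEL_PARAMS; the query at the end reads back the Parquet written to
# table_root() using standard pyspark:
#
#   spark = util.Spark.getOrCreate()
#   MyActivationsTable.setup(spark=spark)
#   df = spark.read.parquet(MyActivationsTable.table_root())
#   df.select('model_name', 'tensor_name').distinct().show()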
def load_or_train(cls, params=None):
  log = util.create_log()
  params = params or MNIST.Params()
  model = MNIST(params=params)

  if not os.path.exists(os.path.join(params.MODEL_BASEDIR, 'model.ckpt')):
    log.info("Training!")
    # Training in a subprocess allows recovery of GPU memory; see the
    # TFSessionPool comments:
    # import multiprocessing
    # p = multiprocessing.Process(target=mnist_train, args=(params,))
    # p.start()
    # p.join()
    mnist_train(params)
    log.info("Done training!")

  model.igraph = MNISTGraph(params)
  return model
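# A minimal usage sketch, assuming MNIST.Params() defaults point at writable
# DATA_BASEDIR and MODEL_BASEDIR directories:
#
#   params = MNIST.Params()
#   model = MNIST.load_or_train(params)  # trains only if no checkpoint exists
#   # model.igraph wraps the inference graph (MNISTGraph) for downstream use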
def datasets_iter_image_rows(cls, params=None):
  params = params or MNIST.Params()
  log = util.create_log()

  def gen_dataset(ds, split):
    import imageio
    import numpy as np

    n = 0
    with util.tf_data_session(ds) as (sess, iter_dataset):
      for image, label in iter_dataset():
        image = np.reshape(image * 255., (28, 28, 1)).astype(np.uint8)
        label = int(label)
        row = dataset.ImageRow.from_np_img_labels(
          image,
          label,
          dataset=cls.TABLE_NAME,
          split=split,
          uri='mnist_%s_%s' % (split, n))
        yield row

        if params.LIMIT >= 0 and n == params.LIMIT:
          break
        n += 1
        if n % 100 == 0:
          log.info("Read %s records from tf.Dataset" % n)

  from official.mnist import dataset as mnist_dataset

  # Keep our dataset ops in an isolated graph
  g = tf.Graph()
  with g.as_default():
    gens = itertools.chain(
      gen_dataset(mnist_dataset.train(params.DATA_BASEDIR), 'train'),
      gen_dataset(mnist_dataset.test(params.DATA_BASEDIR), 'test'))
    for row in gens:
      yield row
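# A minimal sketch of materializing the generated rows into a Parquet table,
# assuming `dataset.ImageRow.write_to_parquet` is the writer defined in this
# project and that MNIST data is available under params.DATA_BASEDIR; the
# output directory is hypothetical:
#
#   params = MNIST.Params()
#   rows = MNIST.datasets_iter_image_rows(params)
#   dataset.ImageRow.write_to_parquet(rows, dest_dir='/tmp/mnist_image_rows')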
def _create_egg(cls, src_root=None, tmp_path=None):
  """Build a Python Egg from the current project and return a path to the
  artifact.

  Why an Egg?  `pyspark` supports zipfiles and egg files as Python artifacts.
  One might wish to use a wheel instead of an egg.  See this excellent
  article and repo:
    * https://bytes.grubhub.com/managing-dependencies-and-artifacts-in-pyspark-7641aa89ddb7
    * https://github.com/alekseyig/spark-submit-deps

  The drawbacks to using a wheel include:
    * Wheels often require native libraries to be installed (e.g. via
      `apt-get`), and those deps are typically best baked into the Spark
      Worker environment (versus installed on every job run).
    * The `BdistSpark` example above is actually rather slow, especially
      when Tensorflow is a dependency, and `BdistSpark` must run before
      every job is submitted.
    * Spark treats wheels as zip files and unzips them on every run; this
      unzip operation can be very expensive if the zipfile contains large
      binaries (e.g. Tensorflow).

  In comparison, an Egg provides the main benefit we want: shipping project
  code (often pre-committed code) to workers.
  """
  log = util.create_log()

  if tmp_path is None:
    import tempfile
    tempdir = tempfile.gettempdir()

    SUBDIR_NAME = 'au_eggs'
    tmp_path = os.path.join(tempdir, SUBDIR_NAME)
    util.cleandir(tmp_path)

  if src_root is None:
    log.info("Trying to auto-resolve path to src root ...")
    try:
      import inspect
      path = inspect.getfile(inspect.currentframe())
      src_root = os.path.dirname(os.path.abspath(path))
    except Exception as e:
      log.info(
        "Failed to auto-resolve src root, "
        "falling back to %s" % cls.SRC_ROOT)
      src_root = cls.SRC_ROOT

  src_root = '/opt/au'  # NB: hard-coded; overrides any path resolved above
  log.info("Using source root %s " % src_root)

  # Below is a programmatic way to run something like:
  #   $ cd /opt/au && python setup.py clean bdist_egg
  # Based upon https://github.com/pypa/setuptools/blob/a94ccbf404a79d56f9b171024dee361de9a948da/setuptools/tests/test_bdist_egg.py#L30
  # See also:
  #   * https://github.com/pypa/setuptools/blob/f52b3b1c976e54df7a70db42bf59ca283412b461/setuptools/dist.py
  #   * https://github.com/pypa/setuptools/blob/46af765c49f548523b8212f6e08e1edb12f22ab6/setuptools/tests/test_sdist.py#L123
  #   * https://github.com/pypa/setuptools/blob/566f3aadfa112b8d6b9a1ecf5178552f6e0f8c6c/setuptools/__init__.py#L51
  from setuptools.dist import Distribution
  from setuptools import PackageFinder

  MODNAME = os.path.split(src_root)[-1]
  dist = Distribution(attrs=dict(
    script_name='setup.py',
    script_args=[
      'clean',
      'bdist_egg',
        '--dist-dir', tmp_path,
        '--bdist-dir', os.path.join(tmp_path, 'workdir'),
    ],
    name=MODNAME,
    src_root=src_root,
    packages=PackageFinder.find(where=src_root)))

  log.info("Generating egg to %s ..." % tmp_path)
  with util.quiet():
    dist.parse_command_line()
    dist.run_commands()

  egg_path = os.path.join(tmp_path, MODNAME + '-0.0.0-py2.7.egg')
  assert os.path.exists(egg_path)
  log.info("... done. Egg at %s" % egg_path)
  return egg_path
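# A minimal sketch of shipping the egg to Spark workers, assuming this
# classmethod lives on the project's Spark helper (here called `Spark`) and
# that an active SparkSession named `spark` exists; `addPyFile` is standard
# pyspark API:
#
#   egg_path = Spark._create_egg()
#   spark.sparkContext.addPyFile(egg_path)
#   # Workers can now import the project module packaged in the egg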
def write_to_parquet(
    rows,
    dest_dir,
    rows_per_file=-1,
    partition_cols=DEFAULT_PQ_PARTITION_COLS,
    compression='lz4',
    spark=None):

  is_rdd, is_pyspark_df = False, False
  try:
    import pyspark.rdd
    import pyspark.sql
    is_rdd = isinstance(rows, pyspark.rdd.RDD)
    is_pyspark_df = isinstance(rows, pyspark.sql.dataframe.DataFrame)
    if is_pyspark_df:
      df = rows
  except ImportError:
    pass

  if is_rdd:
    assert spark is not None
    from pyspark.sql import Row

    # RDD[ImageRow] -> DataFrame[ImageRow]
    rows_rdd = rows.map(lambda r: Row(**r.to_dict()))
    df = spark.createDataFrame(rows_rdd)
    is_pyspark_df = True

  if is_pyspark_df:
    util.log.info("Writing parquet to %s ..." % dest_dir)
    df.printSchema()  # NB: can't .show() b/c of binary data
    df.write.parquet(
      dest_dir,
      mode='append',
      partitionBy=partition_cols,
      compression=compression)
    util.log.info("... done! Wrote to %s ." % dest_dir)
  else:
    # Use pyarrow to write Parquet in this process
    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

    log = create_log()

    if rows_per_file >= 1:
      irows = util.ichunked(rows, rows_per_file)
    else:
      rows = list(rows)
      if not rows:
        return
      irows = iter([rows])

    util.log.info("Writing parquet to %s ..." % dest_dir)
    for row_chunk in irows:
      r = row_chunk[0]

      # Pandas wants dicts
      if isinstance(r, ImageRow):
        row_chunk = [r.to_dict() for r in row_chunk]

      df = pd.DataFrame(row_chunk)
      table = pa.Table.from_pandas(df)
      util.mkdir(dest_dir)
      pq.write_to_dataset(
        table,
        dest_dir,
        partition_cols=partition_cols,
        preserve_index=False,  # Don't care about the pandas index
        compression='snappy',  # NB: pyarrow lz4 is broken, see https://github.com/apache/arrow/issues/3491
        flavor='spark')
      util.log.info("... wrote %s rows ..." % len(row_chunk))

    util.log.info("... done writing to %s ." % dest_dir)
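# Minimal usage sketches for the two code paths above, assuming this function
# is exposed as a staticmethod of ImageRow; paths and row sources are
# hypothetical:
#
# 1) Driver-local path (pyarrow): `rows` is any iterable of ImageRow or dicts.
#
#      ImageRow.write_to_parquet(rows, '/tmp/image_table', rows_per_file=1000)
#
# 2) Distributed path (Spark): pass an RDD[ImageRow] along with the session.
#
#      spark = util.Spark.getOrCreate()
#      rdd = spark.sparkContext.parallelize(image_rows)
#      ImageRow.write_to_parquet(rdd, '/tmp/image_table', spark=spark)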