def create_test_fixtures(cls):
  log = util.create_log()
  log.info("Creating bdd100k test fixtures ...")
  ZIPS_TO_COPY = (cls.telemetry_zip(),)

  util.cleandir(cls.TEST_FIXTURE_DIR)
  for path in ZIPS_TO_COPY:
    util.copy_n_from_zip(path, cls.test_fixture(path), 10)

  # Videos: just copy the ones that have INFO data
  log.info("Copying videos ...")
  fws = util.ArchiveFileFlyweight.fws_from(
    cls.test_fixture(cls.telemetry_zip()))
  for fw in fws:
    if 'json' not in fw.name:
      continue
    relpath = InfoDataset.json_fname_to_video_fname(fw.name)
    relpath = relpath[len('bdd100k/info/'):]
    path = os.path.join(cls.video_dir(), relpath)
    dest = cls.test_fixture(path)
    util.mkdir(os.path.dirname(dest))
    util.run_cmd('cp -v ' + path + ' ' + dest)
  log.info("... done copying videos.")

  # For testing, create a video that has no INFO
  dest = cls.test_fixture(
    os.path.join(
      cls.video_dir(), '100k', 'train', 'video_with_no_info.mov'))
  codec = 'h264'  # Chrome will not play `png` movies
  video_bytes = testutils.VideoFixture(codec=codec).get_bytes()
  with open(dest, 'wb') as f:
    f.write(video_bytes)
  log.info("Wrote synth video to %s ..." % dest)

def download_all(cls):
  util.mkdir(cls.zip_path(''))
  for fname in cls.DATA_ZIPS:
    uri = cls.BASE_ZIP_URL + '/' + fname
    util.download(uri, cls.zip_path(fname), try_expand=False)
  for fname in cls.ANNO_ZIPS:
    uri = cls.BASE_ANNO_URL + '/' + fname
    util.download(uri, cls.zip_path(fname), try_expand=False)

def copy_vid(fw):
  vid_dest = os.path.join(dest_dir, fw.name)
  util.mkdir(os.path.dirname(vid_dest))
  if dry_run:
    print "DRY RUN SKIPPED: " + fw.name
  else:
    with open(vid_dest, 'wb') as f:
      f.write(fw.data)

def to_debug(self, fname=''):
  """Convenience for dumping an image to a place on disk where the user
  can view locally (e.g. using Apple Finder file preview, Ubuntu image
  browser, an nginx instance pointed at the folder, etc).
  FMI see conf.AU_CACHE_TMP
  """
  if self.image_bytes == '':
    return None

  dest = os.path.join(conf.AU_CACHE_TMP, self.fname())
  util.mkdir(conf.AU_CACHE_TMP)
  with open(dest, 'wb') as f:
    f.write(self.image_bytes)
  return dest

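# A hedged usage sketch for `to_debug`: given an ImageRow-like object with
# non-empty `image_bytes`, dump it under conf.AU_CACHE_TMP for local viewing.
# The helper below is illustrative only and not part of this module's API.
def _example_to_debug(row):
  dest = row.to_debug()
  if dest is None:
    print 'Row has no image bytes to dump'
  else:
    print 'Preview the image at ' + dest
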
def use_tempdir(monkeypatch, test_tempdir):
  from au import util
  monkeypatch.setattr(conf, 'AU_CACHE', test_tempdir)
  monkeypatch.setattr(conf, 'AU_CACHE_TMP', os.path.join(test_tempdir, 'tmp'))
  monkeypatch.setattr(conf, 'AU_DATA_CACHE', os.path.join(test_tempdir, 'data'))
  monkeypatch.setattr(
    conf, 'AU_TABLE_CACHE', os.path.join(test_tempdir, 'tables'))
  monkeypatch.setattr(
    conf, 'AU_MODEL_CACHE', os.path.join(test_tempdir, 'models'))
  monkeypatch.setattr(
    conf, 'AU_TENSORBOARD_DIR', os.path.join(test_tempdir, 'tensorboard'))

  # Start from a clean slate: clear any stale tempdir from a prior run
  # (unless the user asked to keep it), then (re-)create it.
  if not os.environ.get('AU_NO_DEL_TEST_TEMPDIR'):
    util.rm_rf(test_tempdir)
  util.mkdir(test_tempdir)

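# A hedged sketch of wiring `use_tempdir` into a pytest suite. The
# `test_tempdir` fixture below is an assumption about how the path gets
# chosen (the real project may define it in conftest.py, and may declare
# `use_tempdir` itself as a fixture).
import pytest

@pytest.fixture
def test_tempdir(tmpdir_factory):
  return str(tmpdir_factory.mktemp('au_test'))

def test_caches_point_at_tempdir(monkeypatch, test_tempdir):
  use_tempdir(monkeypatch, test_tempdir)
  assert conf.AU_CACHE == test_tempdir
  assert conf.AU_DATA_CACHE == os.path.join(test_tempdir, 'data')
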
def save(self, dest=None):
  if not dest:
    fname = self.video.name + '.html'
    dest = os.path.join(
      self.video.viddataset.FIXTURES.video_debug_dir(), fname)
    util.mkdir(self.video.viddataset.FIXTURES.video_debug_dir())

  video = self._gen_video_html()
  map_path = self._save_map_html(dest)
  plot_paths = self._save_plots(dest)

  # We'll embed relative paths in the HTML
  map_fname = os.path.basename(map_path)
  plot_fnames = map(os.path.basename, plot_paths)

  map_html = ''
  if map_fname:
    map_html = (
      '<iframe width="40%%" height="40%%" src="%s"></iframe>' % map_fname)
  plots_html = ''.join(
    '<img src="%s" width="400px" object-fit="contain" />' % p
    for p in plot_fnames)

  # NB: PAGE is rendered with str.format(), which does not process
  # %-escapes, so literal percents below are written as `%`, not `%%`
  PAGE = """
    <html>
    <head></head>
    <body>
      <div height="40%">
        {video}
        {map}
      </div>
      <br>
      <div>
        {plots}
      </div>
    </body>
    </html>
  """
  html = PAGE.format(video=video, map=map_html, plots=plots_html)
  with open(dest, 'w') as f:
    f.write(html)
  util.log.info("Saved page to %s" % dest)

def write_to_pngs(rows, dest_root=None):
  dest_root = dest_root or conf.AU_DATA_CACHE
  util.log.info("Writing PNGs to %s ..." % dest_root)
  n = 0
  for row in rows:
    dest_dir = os.path.join(
      dest_root,
      row.dataset or 'default_dataset',
      row.split or 'default_split')
    util.mkdir(dest_dir)

    fname = row.fname()
    dest = os.path.join(dest_dir, fname)
    with open(dest, 'wb') as f:
      f.write(row.image_bytes)

    n += 1
    if n % 100 == 0:
      util.log.info("... wrote %s PNGs ..." % n)
  util.log.info("... wrote %s total PNGs to %s ." % (n, dest_root))

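# A hedged usage sketch for `write_to_pngs`; the ImageRow constructor kwargs
# are assumptions inferred from the attributes the writer reads (`dataset`,
# `split`, `image_bytes`, `fname()`), and `png_bytes` is assumed to hold
# already-encoded PNG data.
def _example_write_to_pngs(png_bytes):
  rows = [
    ImageRow(dataset='demo', split='train', uri='demo://0',
             image_bytes=png_bytes),
  ]
  write_to_pngs(rows)  # Writes under conf.AU_DATA_CACHE/demo/train/
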
def _setup_indices(cls):
  import shelve
  if not os.path.exists(cls._index_file('')):
    ###
    ### Based upon _create_tf_record_from_coco_annotations()
    ###

    import json
    import pprint

    # From tensorflow/models
    from object_detection.utils import label_map_util

    zip_path = cls.FIXTURES.zip_path(cls.ZIP_FNAME)
    util.log.info("Building annotations index for %s ..." % zip_path)

    fws = util.ArchiveFileFlyweight.fws_from(zip_path)
    anno_fw = None
    for fw in fws:
      if cls.ANNO_FNAME in fw.name:
        anno_fw = fw
    assert anno_fw, \
      "Could not find entry for %s in %s" % (cls.ANNO_FNAME, zip_path)

    util.log.info("... reading json ...")
    anno_data = json.loads(anno_fw.data)
    util.log.info("... json loaded ...")

    images = anno_data['images']
    category_index = label_map_util.create_category_index(
      anno_data['categories'])
    category_index = dict(
      (str(k), v) for k, v in category_index.iteritems())
    util.log.info("Have annotations index for %s images." % len(images))
    util.log.info("Category index: \n\n%s" % pprint.pformat(category_index))

    image_to_annos = {}
    if 'annotations' in anno_data:
      util.log.info("... Building image ID -> Annos ...")
      for anno in anno_data['annotations']:
        # NB: we must string-ify keys for `shelve`
        image_id = str(anno['image_id'])
        image_to_annos.setdefault(image_id, [])
        image_to_annos[image_id].append(anno)

    missing_anno_count = sum(
      1 for image in images
      if str(image['id']) not in image_to_annos)
    util.log.info("... %s images are missing annos ..." % missing_anno_count)

    util.log.info("... finished index for %s ." % zip_path)

    image_id_to_image = dict(
      (str(image['id']), image) for image in images)

    def dump_to_shelf(name, data):
      dest = cls._index_file(name)
      util.log.info("... saving %s to %s ..." % (name, dest))
      import pickle
      d = shelve.open(dest, protocol=pickle.HIGHEST_PROTOCOL)
      d.update(data.iteritems())
      d.close()

    # Keeping the below data in memory will OOM almost any reasonable box,
    # so we cache the data on disk.
    util.mkdir(cls._index_file(''))
    dump_to_shelf('image_id_to_image', image_id_to_image)
    dump_to_shelf('category_index', category_index)
    dump_to_shelf('image_to_annos', image_to_annos)

  util.log.info("Using indices in %s" % cls._index_file(''))
  cls._image_id_to_image = shelve.open(cls._index_file('image_id_to_image'))
  cls._category_index = shelve.open(cls._index_file('category_index'))
  cls._image_to_annos = shelve.open(cls._index_file('image_to_annos'))

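# A hedged sketch of querying the shelve-backed indices once
# `_setup_indices()` has run. `CocoTable` is a hypothetical stand-in for
# whatever class defines `_setup_indices`, and the image id is made up.
def _example_lookup_annos():
  CocoTable._setup_indices()

  image_id = '12345'  # Hypothetical id; NB: shelve keys are strings
  image = CocoTable._image_id_to_image.get(image_id)
  annos = CocoTable._image_to_annos.get(image_id, [])
  if image:
    print '%s has %s annotations' % (image['file_name'], len(annos))
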
def write_to_parquet(
    rows,
    dest_dir,
    rows_per_file=-1,
    partition_cols=DEFAULT_PQ_PARTITION_COLS,
    compression='lz4',
    spark=None):

  is_rdd, is_pyspark_df = False, False
  try:
    import pyspark.rdd
    import pyspark.sql
    is_rdd = isinstance(rows, pyspark.rdd.RDD)
    is_pyspark_df = isinstance(rows, pyspark.sql.dataframe.DataFrame)
    if is_pyspark_df:
      df = rows
  except ImportError:
    pass

  if is_rdd:
    assert spark is not None
    from pyspark.sql import Row
    # RDD[ImageRow] -> DataFrame[ImageRow]
    rows_rdd = rows.map(lambda r: Row(**r.to_dict()))
    df = spark.createDataFrame(rows_rdd)
    is_pyspark_df = True

  if is_pyspark_df:
    util.log.info("Writing parquet to %s ..." % dest_dir)
    df.printSchema()  # NB: can't .show() b/c of binary data
    df.write.parquet(
      dest_dir,
      mode='append',
      partitionBy=partition_cols,
      compression=compression)
    util.log.info("... done! Wrote to %s ." % dest_dir)
  else:
    # Use pyarrow to write Parquet in this process
    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

    if rows_per_file >= 1:
      irows = util.ichunked(rows, rows_per_file)
    else:
      rows = list(rows)
      if not rows:
        return
      irows = iter([rows])

    util.log.info("Writing parquet to %s ..." % dest_dir)
    for row_chunk in irows:
      r = row_chunk[0]

      # Pandas wants dicts
      if isinstance(r, ImageRow):
        row_chunk = [r.to_dict() for r in row_chunk]

      df = pd.DataFrame(row_chunk)
      table = pa.Table.from_pandas(
        df,
        preserve_index=False)  # Don't care about pandas index
      util.mkdir(dest_dir)
      pq.write_to_dataset(
        table,
        dest_dir,
        partition_cols=partition_cols,
        compression='snappy',
          # NB: pyarrow lz4 is totes broken
          # https://github.com/apache/arrow/issues/3491
        flavor='spark')
      util.log.info("... wrote %s rows ..." % len(row_chunk))
    util.log.info("... done writing to %s ." % dest_dir)

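# A hedged usage sketch for `write_to_parquet`, covering both the local
# pyarrow path and the Spark path. `rows` and `spark` are assumed to come
# from elsewhere, and the output directories are made up.
def _example_write_to_parquet(rows, spark=None):
  # Local path: pyarrow writes snappy-compressed files, 1000 rows per chunk
  write_to_parquet(rows, '/tmp/au_parquet_demo', rows_per_file=1000)

  if spark is not None:
    # Spark path: distribute the rows and let Spark write the dataset
    rdd = spark.sparkContext.parallelize(rows)
    write_to_parquet(rdd, '/tmp/au_parquet_demo_spark', spark=spark)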