Example #1
    def create_test_fixtures(cls):
        log = util.create_log()

        log.info("Creating bdd100k test fixtures ...")
        ZIPS_TO_COPY = (cls.telemetry_zip(), )

        util.cleandir(cls.TEST_FIXTURE_DIR)
        for path in ZIPS_TO_COPY:
            util.copy_n_from_zip(path, cls.test_fixture(path), 10)

        # Videos: just copy the ones that have INFO data
        log.info("Copying videos ...")
        fws = util.ArchiveFileFlyweight.fws_from(
            cls.test_fixture(cls.telemetry_zip()))
        for fw in fws:
            if 'json' not in fw.name:
                continue

            relpath = InfoDataset.json_fname_to_video_fname(fw.name)
            relpath = relpath[len('bdd100k/info/'):]
            path = os.path.join(cls.video_dir(), relpath)
            dest = cls.test_fixture(path)
            util.mkdir(os.path.dirname(dest))
            util.run_cmd('cp -v ' + path + ' ' + dest)
        log.info("... done copying videos.")

        # For testing, create a video that has no INFO
        dest = cls.test_fixture(
            os.path.join(cls.video_dir(), '100k', 'train',
                         'video_with_no_info.mov'))
        codec = 'h264'  # Chrome will not play `png` movies
        video_bytes = testutils.VideoFixture(codec=codec).get_bytes()
        util.mkdir(os.path.dirname(dest))  # Ensure the parent dir exists
        with open(dest, 'wb') as f:  # NB: 'wc' is not a valid open() mode; bytes need 'wb'
            f.write(video_bytes)
        log.info("Wrote synth video to %s ..." % dest)
Example #2
 def download_all(cls):
     util.mkdir(cls.zip_path(''))
     for fname in cls.DATA_ZIPS:
         uri = cls.BASE_ZIP_URL + '/' + fname
         util.download(uri, cls.zip_path(fname), try_expand=False)
     for fname in cls.ANNO_ZIPS:
         uri = cls.BASE_ANNO_URL + '/' + fname
         util.download(uri, cls.zip_path(fname), try_expand=False)
Example #3
 def copy_vid(fw):
     vid_dest = os.path.join(dest_dir, fw.name)
     util.mkdir(os.path.dirname(vid_dest))
     if dry_run:
         print "DRY RUN SKIPPED: " + f.name
     else:
         with open(vid_dest, 'wb') as f:  # 'wc' is not a valid open() mode
             f.write(fw.data)
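`copy_vid` closes over `dest_dir` and `dry_run` from its enclosing scope. A hypothetical enclosing function (names assumed, not from the source) might look like:

def copy_videos(fws, dest_dir, dry_run=False):
    # `fws` are archive flyweights exposing `.name` and `.data`
    def copy_vid(fw):
        ...  # body as above
    for fw in fws:
        copy_vid(fw)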
Example #4
 def to_debug(self, fname=''):
   """Convenience for dumping an image to a place on disk where the user can
   view locally (e.g. using Apple Finder file preview, Ubuntu
   image browser, an nginx instance pointed at the folder, etc).
   
   FMI see conf.AU_CACHE_TMP
   """
    if not self.image_bytes:
     return None
   
   dest = os.path.join(conf.AU_CACHE_TMP, self.fname())
   util.mkdir(conf.AU_CACHE_TMP)
   with open(dest, 'wb') as f:
     f.write(self.image_bytes)
   return dest 
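A sketch of typical usage, assuming an `ImageRow`-like object with non-empty `image_bytes`:

row = ...  # hypothetical: any object with `image_bytes` and `fname()`
path = row.to_debug()
if path:
    print("Preview the image at", path)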
Example #5
def use_tempdir(monkeypatch, test_tempdir):
    from au import util
    monkeypatch.setattr(conf, 'AU_CACHE', test_tempdir)
    monkeypatch.setattr(conf, 'AU_CACHE_TMP',
                        os.path.join(test_tempdir, 'tmp'))
    monkeypatch.setattr(conf, 'AU_DATA_CACHE',
                        os.path.join(test_tempdir, 'data'))
    monkeypatch.setattr(conf, 'AU_TABLE_CACHE',
                        os.path.join(test_tempdir, 'tables'))
    monkeypatch.setattr(conf, 'AU_MODEL_CACHE',
                        os.path.join(test_tempdir, 'models'))
    monkeypatch.setattr(conf, 'AU_TENSORBOARD_DIR',
                        os.path.join(test_tempdir, 'tensorboard'))

    util.mkdir(test_tempdir)
    yield  # Hand control to the test, then clean up (standard pytest fixture pattern)
    if not os.environ.get('AU_NO_DEL_TEST_TEMPDIR'):
        util.rm_rf(test_tempdir)
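`use_tempdir` follows the standard pytest fixture pattern (the source module presumably decorates it with `@pytest.fixture`). A test opts in simply by naming it as a parameter:

def test_uses_isolated_caches(use_tempdir):
    from au import conf
    # All conf.AU_* paths now point inside the per-test tempdir,
    # so the test cannot pollute the real caches
    assert conf.AU_CACHE_TMP.endswith('tmp')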
Example #6
    def save(self, dest=None):
        if not dest:
            fname = self.video.name + '.html'
            dest = os.path.join(
                self.video.viddataset.FIXTURES.video_debug_dir(), fname)
            util.mkdir(self.video.viddataset.FIXTURES.video_debug_dir())

        video = self._gen_video_html()
        map_path = self._save_map_html(dest)
        plot_paths = self._save_plots(dest)

        # We'll embed relative paths in the HTML
        map_fname = os.path.basename(map_path)
        plot_fnames = [os.path.basename(p) for p in plot_paths]

        map_html = ''
        if map_fname:
            map_html = (
                '<iframe width="40%%" height="40%%" src="%s"></iframe>' %
                map_fname)
        plots_html = ''.join(
            '<img src="%s" width="400px" object-fit="contain" />' % p
            for p in plot_fnames)

        PAGE = """
      <html>
      <head></head>
      <body>
        <div height="40%%">
          {video} {map}
        </div>
        <br>
        <div>
          {plots}
        </div>
      </body>
      </html>
      """

        html = PAGE.format(video=video, map=map_html, plots=plots_html)
        with open(dest, 'w') as f:  # 'wc' is not a valid open() mode
            f.write(html)
        util.log.info("Saved page to %s" % dest)
Example #7
 def write_to_pngs(rows, dest_root=None):
   dest_root = dest_root or conf.AU_DATA_CACHE
   
   util.log.info("Writing PNGs to %s ..." % dest_root)
   n = 0
   for row in rows:
     dest_dir = os.path.join(
                   dest_root,
                   row.dataset or 'default_dataset',
                   row.split or 'default_split')
     util.mkdir(dest_dir)
     
     fname = row.fname()
     
     dest = os.path.join(dest_dir, fname)
     with open(dest, 'wb') as f:
       f.write(row.image_bytes)
     
     n += 1
     if n % 100 == 0:
       util.log.info("... write %s PNGs ..." % n)
   util.log.info("... wrote %s total PNGs to %s ." % (n, dest_root))  
Example #8
    def _setup_indices(cls):
        import shelve

        if not os.path.exists(cls._index_file('')):

            ###
            ### Based upon _create_tf_record_from_coco_annotations()
            ###

            import json
            import pprint

            # From tensorflow/models
            from object_detection.utils import label_map_util

            zip_path = cls.FIXTURES.zip_path(cls.ZIP_FNAME)
            util.log.info("Building annotations index for %s ..." % zip_path)

            fws = util.ArchiveFileFlyweight.fws_from(zip_path)
            anno_fw = None
            for fw in fws:
                if cls.ANNO_FNAME in fw.name:
                    anno_fw = fw
            assert anno_fw, \
              "Could not find entry for %s in %s" % (cls.ANNO_FNAME, zip_path)

            util.log.info("... reading json ...")
            anno_data = json.loads(anno_fw.data)
            util.log.info("... json loaded ...")

            images = anno_data['images']
            category_index = label_map_util.create_category_index(
                anno_data['categories'])
            category_index = dict(
                (str(k), v) for k, v in category_index.items())

            util.log.info("Have annotations index for %s images." %
                          len(images))
            util.log.info("Category index: \n\n%s" %
                          pprint.pformat(category_index))

            image_to_annos = {}
            if 'annotations' in anno_data:
                util.log.info("... Building image ID -> Annos ...")
                for anno in anno_data['annotations']:
                    # NB: we must string-ify keys for `shelve`
                    image_id = str(anno['image_id'])
                    image_to_annos.setdefault(image_id, [])
                    image_to_annos[image_id].append(anno)

            missing_anno_count = sum(1 for image in images
                                     if str(image['id']) not in image_to_annos)
            util.log.info("... %s images are missing annos ..." %
                          missing_anno_count)

            util.log.info("... finished index for %s ." % zip_path)

            image_id_to_image = dict(
                (str(image['id']), image) for image in images)

            def dump_to_shelf(name, data):
                dest = cls._index_file(name)
                util.log.info("... saving %s to %s ..." % (name, dest))

                import pickle
                d = shelve.open(dest, protocol=pickle.HIGHEST_PROTOCOL)
                d.update(data.items())
                d.close()

            # Keeping the below data in memory will OOM almost any reasonable box,
            # so we cache the data on disk.
            util.mkdir(cls._index_file(''))
            dump_to_shelf('image_id_to_image', image_id_to_image)
            dump_to_shelf('category_index', category_index)
            dump_to_shelf('image_to_annos', image_to_annos)

        util.log.info("Using indices in %s" % cls._index_file(''))
        cls._image_id_to_image = shelve.open(
            cls._index_file('image_id_to_image'))
        cls._category_index = shelve.open(cls._index_file('category_index'))
        cls._image_to_annos = shelve.open(cls._index_file('image_to_annos'))
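The string-ified keys above ("NB: we must string-ify keys for `shelve`") are required because `shelve` only accepts string keys. A minimal standalone demonstration:

import shelve

with shelve.open('/tmp/demo_index') as d:
    d[str(123)] = {'id': 123, 'file_name': 'img.jpg'}
    # d[123] = ...  would fail: shelve keys must be strings
    print(d['123'])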
Example #9
  def write_to_parquet(
        rows,
        dest_dir,
        rows_per_file=-1,
        partition_cols=DEFAULT_PQ_PARTITION_COLS,
        compression='lz4',
        spark=None):
    
    is_rdd, is_pyspark_df = False, False
    try:
      import pyspark.rdd
      import pyspark.sql
      is_rdd = isinstance(rows, pyspark.rdd.RDD)
      is_pyspark_df = isinstance(rows, pyspark.sql.dataframe.DataFrame)
      if is_pyspark_df:
        df = rows
    except ImportError:
      pass
    
    if is_rdd:
      assert spark is not None
      from pyspark.sql import Row

      # RDD[ImageRow] -> DataFrame[ImageRow]
      rows_rdd = rows.map(lambda r: Row(**r.to_dict()))
      df = spark.createDataFrame(rows_rdd)
      is_pyspark_df = True
    
    if is_pyspark_df:
      util.log.info("Writing parquet to %s ..." % dest_dir)
      df.printSchema() # NB: can't .show() b/c of binary data
      df.write.parquet(
        dest_dir,
        mode='append',
        partitionBy=partition_cols,
        compression=compression)
      util.log.info("... done! Wrote to %s ." % dest_dir)
    
    else:

      # Use Pyarrow to write Parquet in this process

      import pandas as pd
      import pyarrow as pa
      import pyarrow.parquet as pq
      
      if rows_per_file >= 1:
        irows = util.ichunked(rows, rows_per_file)
      else:
        rows = list(rows)
        if not rows:
          return
        irows = iter([rows])
      
      util.log.info("Writing parquet to %s ..." % dest_dir)
      for row_chunk in irows:
        r = row_chunk[0]
        
        # Pandas wants dicts
        if isinstance(r, ImageRow):
          row_chunk = [r.to_dict() for r in row_chunk]

        df = pd.DataFrame(row_chunk)
        table = pa.Table.from_pandas(df)
        util.mkdir(dest_dir)
        pq.write_to_dataset(
              table,
              dest_dir,
              partition_cols=partition_cols,
              preserve_index=False, # Don't care about pandas index
              compression='snappy',
                # NB: pyarrow lz4 is totes broken https://github.com/apache/arrow/issues/3491
              flavor='spark')
        util.log.info("... wrote %s rows ..." % len(row_chunk))
      util.log.info("... done writing to %s ." % dest_dir)