Example #1
def CreateTestData(uri):
    # Write 10,000 zero-padded decimal keys, each mapped to itself.
    writer = py_pert.StringTableWriter()
    writer.Open(uri, 4)  # 4 output shards

    for i in range(10000):
        d = '%05d' % i
        writer.Add(d, d)
    writer.Close()
    return
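
For reference, a minimal read-back sketch for the table written above. This is an assumption, not part of the original example: it supposes the sharded table can be iterated with the StringTableShardSetReader shown in Example #2, which yields (key, value) pairs.

def ReadTestData(uri):
    # Hypothetical sketch (not in the original examples): iterate the
    # sharded table written by CreateTestData and verify its contents.
    reader = py_pert.StringTableShardSetReader()
    reader.Open(uri)
    for k, v in reader:
        assert k == v  # each key was written as its own value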
Example #2
    def Run(self):
        reader = py_pert.StringTableShardSetReader()
        reader.Open(self.GetInput('images').GetUri())
        image_ids = []
        for k, v in reader:
            image_ids.append(ParseUint64Key(k))

        LOG(INFO, 'creating match groups')
        match_groups = []  # list of (primary_id, secondary_id_list) tuples
        widgets = [Percentage(), ' ', Bar(), ' ', ETA()]
        pbar = ProgressBar(widgets=widgets, maxval=len(image_ids)).start()
        for i in range(len(image_ids)):
            primary_id = image_ids[i]
            secondary_ids = list(image_ids)
            secondary_ids.remove(primary_id)
            for secondary_id_chunk in chunks(secondary_ids,
                                             self.max_batch_size):
                match_groups.append((primary_id, secondary_id_chunk))
            pbar.update(i)
        pbar.finish()

        # write out the match plan (must be later sorted by key for future join stage)
        writer = py_pert.StringTableWriter()
        options = py_pert.WriterOptions()
        options.SetUnsorted()
        LOG(INFO, 'writing match groups')
        CHECK(
            writer.Open(
                self.GetOutput('unsorted_match_batches').GetUri(), 1, options))
        pbar = ProgressBar(widgets=widgets, maxval=len(match_groups)).start()
        for batch_id, (batch_primary_image,
                       batch_image_ids) in enumerate(match_groups):
            if len(batch_image_ids) == 0:
                continue

            batch_name = py_base.Uint64ToKey(batch_id)
            metadata = iw_pb2.MatchBatchMetadata()
            metadata.image_id = batch_primary_image
            metadata.batch_name = batch_name
            metadata.is_primary = True
            writer.Add(py_base.Uint64ToKey(metadata.image_id),
                       metadata.SerializeToString())

            for image_id in batch_image_ids:
                metadata.image_id = image_id
                metadata.batch_name = batch_name
                metadata.is_primary = False
                writer.Add(py_base.Uint64ToKey(metadata.image_id),
                           metadata.SerializeToString())

            pbar.update(batch_id)

        pbar.finish()
        writer.Close()
        return
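
The chunks helper used above is not defined in this snippet. A minimal sketch consistent with how it is called (splitting secondary_ids into pieces of at most self.max_batch_size elements):

def chunks(seq, size):
    # Hypothetical helper, inferred from usage above: yield successive
    # slices of at most `size` elements from `seq`.
    for start in range(0, len(seq), size):
        yield seq[start:start + size]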
Example #3
def test_string_table():
    filename = "local:///home/ubuntu/Desktop/test_string_table"

    person = test_pb2.Person()
    person.first_name = 'foo'
    person.last_name = 'bar'

    writer = pert.StringTableWriter()
    writer.Open(filename, 1)
    writer.Add('key1', person.SerializeToString())
    writer.Add('key2', person.SerializeToString())
    writer.Close()

    reader = pert.StringTableReader()
    reader.Open(filename)

    for k, v in reader:
        my_person = test_pb2.Person()
        my_person.ParseFromString(v)
        print "key %s value %s" % (k, my_person)

    return
Example #4
    def Run(self):
        uri = self.outputs['mg'].GetUri()
        writer = py_pert.StringTableWriter()
        options = py_pert.WriterOptions()
        options.SetSorted('memcmp')  # memcmp key ordering
        writer.Open(uri, 1, options)  # single shard
        writer.Close()
        return
Example #5
def PackImagesDirectoryToPert(src_path, output_uri):
  CHECK(os.path.isdir(src_path), 'expected dir: %s' % src_path)
  
  # generate filename cache if it doesn't yet exist
  images_filename_cache = '%s/filename_cache.txt' % (src_path)
  
  # force regen
  #if os.path.exists(images_filename_cache):
  #  os.remove(images_filename_cache)
  
  if not os.path.exists(images_filename_cache):
    print 'creating filename cache'
    # get list of files
    filenames = glob.glob('%s/*.jpg' % src_path)
    filenames.sort()
    filenames_file = open(images_filename_cache, 'w')
    for filename in filenames:
      filebase = os.path.basename(filename)
      filenames_file.write('%s\n' % filebase)
    filenames_file.close()
  else:
    print 'using existing filename cache'
  
  num_files = NumLinesInFile(images_filename_cache)  
  filenames_file = open(images_filename_cache, 'r')
  
  key_size_bytes = 32
  block_size_mb = 0.5
  num_shards = 10
  desired_bloom_error_rate = 0.005
  num_files_per_shard = long(float(num_files)/num_shards)
  
  num_blocks_per_shard = 4000
  index_bytes_per_block = key_size_bytes + 8*3
  index_bytes_per_shard = index_bytes_per_block * num_blocks_per_shard  
  
  sample_pos_keys, sample_neg_keys = GetSampleShardKeys(images_filename_cache, num_shards)
  
  num_bits_tuned = py_pert.TuneRequiredBits(
      sample_pos_keys, sample_neg_keys, desired_bloom_error_rate) * num_shards
  num_megabytes_tuned = BitsToMegabytes(num_bits_tuned)
  print 'num_megabytes_tuned: %f' % num_megabytes_tuned
  
  num_megabytes_active_blocks = block_size_mb * num_shards
  num_megabytes_indices = BytesToMegabytes(index_bytes_per_shard*num_shards)
  num_megabytes_bloom_filters = num_megabytes_tuned
  
  print 'num_megabytes_active_blocks: %f' % num_megabytes_active_blocks
  print 'num_megabytes_indices: %f' % num_megabytes_indices
  print 'num_megabytes_bloom_filters: %f' % num_megabytes_bloom_filters
  
  options = py_pert.WriterOptions()
  options.SetBlockSize(long(1048576 * block_size_mb)) # bytes per block
  options.SetSorted('memcmp')
  options.SetBloomFilterBitsPerShard(num_bits_tuned)
  writer = py_pert.StringTableWriter()
  writer.Open(output_uri, num_shards, options)
  
  widgets = ['Exporting to %s: ' % output_uri, Percentage(), ' ', Bar(),
             ' ', ETA(), ' ']
  pbar = ProgressBar(widgets=widgets, maxval=num_files).start()
  
  for i, filename in enumerate(filenames_file):
    filename = filename.strip()
    if len(filename) != 36:  # 32-char hash + '.jpg'
      LOG(WARNING, 'skipping invalid hash format file: %s' % filename)
      continue
    
    # strip the '.jpg' extension to recover the 32-char hash key
    hash_key = filename[:-4]
    data = open(os.path.join(src_path, filename), 'rb').read()
    writer.Add(hash_key, data)
    pbar.update(i)
    
  pbar.finish()

  filenames_file.close()
  writer.Close()
  return
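
NumLinesInFile, BitsToMegabytes, and BytesToMegabytes are helpers defined elsewhere in this codebase (as are GetSampleShardKeys and CHECK). Minimal sketches of the three size/count helpers, inferred only from how they are called above:

def NumLinesInFile(path):
  # Hypothetical sketch: count the lines in a text file.
  with open(path) as f:
    return sum(1 for _ in f)

def BytesToMegabytes(num_bytes):
  # Hypothetical sketch: bytes -> MB, using the same 1 MB = 1048576
  # bytes convention as the block-size math above.
  return num_bytes / float(1048576)

def BitsToMegabytes(num_bits):
  # Hypothetical sketch: bits -> MB via bytes.
  return BytesToMegabytes(num_bits / 8.0)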